commit 80333dff74cfd162657532117e5df2c27e71aac2 Author: lingyu123-su <3307872825@qq.com> Date: Mon Sep 8 14:49:28 2025 +0800 first commit diff --git a/Amadeus/.DS_Store b/Amadeus/.DS_Store new file mode 100644 index 0000000..5df37c8 Binary files /dev/null and b/Amadeus/.DS_Store differ diff --git a/Amadeus/__init__.py b/Amadeus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Amadeus/__pycache__/__init__.cpython-310.pyc b/Amadeus/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..d9d05ca Binary files /dev/null and b/Amadeus/__pycache__/__init__.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc b/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc new file mode 100644 index 0000000..3ad2148 Binary files /dev/null and b/Amadeus/__pycache__/evaluation_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/model_zoo.cpython-310.pyc b/Amadeus/__pycache__/model_zoo.cpython-310.pyc new file mode 100644 index 0000000..632bb87 Binary files /dev/null and b/Amadeus/__pycache__/model_zoo.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sampling_utils.cpython-310.pyc b/Amadeus/__pycache__/sampling_utils.cpython-310.pyc new file mode 100644 index 0000000..60d2121 Binary files /dev/null and b/Amadeus/__pycache__/sampling_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc b/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc new file mode 100644 index 0000000..c3826ba Binary files /dev/null and b/Amadeus/__pycache__/sub_decoder_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc b/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc new file mode 100644 index 0000000..40dbe19 Binary files /dev/null and b/Amadeus/__pycache__/sub_decoder_zoo.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/train_utils.cpython-310.pyc b/Amadeus/__pycache__/train_utils.cpython-310.pyc new file mode 100644 index 0000000..104f114 Binary files /dev/null and b/Amadeus/__pycache__/train_utils.cpython-310.pyc differ diff --git a/Amadeus/__pycache__/transformer_utils.cpython-310.pyc b/Amadeus/__pycache__/transformer_utils.cpython-310.pyc new file mode 100644 index 0000000..6332188 Binary files /dev/null and b/Amadeus/__pycache__/transformer_utils.cpython-310.pyc differ diff --git a/Amadeus/catsample.py b/Amadeus/catsample.py new file mode 100644 index 0000000..13f0ebe --- /dev/null +++ b/Amadeus/catsample.py @@ -0,0 +1,56 @@ +import torch +import torch.nn.functional as F + + +def gumbel_softmax(categorical_probs, hard=False, eps=1e-9): + logits = categorical_probs.clamp(min=1e-9).log() + return F.gumbel_softmax(logits, hard=hard) + + +def sample_categorical(categorical_probs, method="hard"): + if method == "hard": + gumbel_norm = 1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log() + return (categorical_probs / gumbel_norm).argmax(dim=-1) + else: + raise ValueError(f"Method {method} for sampling categorical variables is not valid.") + +def direct_sampling(logits): + probs = logits.softmax(dim=-1) + index = sample_categorical(probs.to(torch.float32)) + return index + + +def top_p_sampling(logits, p=0.9): + probs = logits.softmax(dim=-1) + + sorted_probs, sorted_indices = torch.sort(probs, descending=True) + cumulative_probs = torch.cumsum(sorted_probs, dim=-1) + sorted_indices_to_remove = cumulative_probs > p + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + 
indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + probs.masked_fill_(indices_to_remove, 0) + probs /= probs.sum(dim=-1).unsqueeze(-1) + index = sample_categorical(probs.to(torch.float32)) + + return index + + +def top_k_sampling(logits, k=400): + top_k_values, top_k_indices = torch.topk(logits, int(k)) + top_k_probs = top_k_values.softmax(dim=-1) + index = sample_categorical(top_k_probs.to(torch.float32)) + index = top_k_indices[torch.arange(index.size(0)), index] + + return index + +def sample_with_strategy(update_logits, strategy, para = None): + if strategy == "direct": + return direct_sampling(update_logits) + elif strategy == "top_p": + return top_p_sampling(update_logits, para) + elif strategy == "top_k": + return top_k_sampling(update_logits, para) + else: + raise ValueError(f"Strategy {strategy} is not valid.") \ No newline at end of file diff --git a/Amadeus/evaluation_utils.py b/Amadeus/evaluation_utils.py new file mode 100644 index 0000000..00aa52b --- /dev/null +++ b/Amadeus/evaluation_utils.py @@ -0,0 +1,533 @@ +from collections import defaultdict +from typing import Union +from math import log +from omegaconf import DictConfig +from pathlib import Path +import pickle +import json +import torch +from tqdm.auto import tqdm +from transformers import T5Tokenizer, T5EncoderModel + +from . import model_zoo +from .symbolic_encoding import data_utils +from .model_zoo import AmadeusModel +from .symbolic_encoding.data_utils import TuneCompiler +from .symbolic_encoding.compile_utils import shift_and_pad +from .symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from .symbolic_encoding import decoding_utils +from .train_utils import adjust_prediction_order +from data_representation import vocab_utils +from data_representation.vocab_utils import LangTokenVocab + +def wandb_style_config_to_omega_config(wandb_conf): + # remove wandb related config + for wandb_key in ["wandb_version", "_wandb"]: + if wandb_key in wandb_conf: + del wandb_conf[wandb_key] # wandb-related config should not be overrided! 
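+ # A wandb-exported config typically wraps each top-level entry as {'desc': ..., 'value': ...}; + # the loop below unwraps that wrapper, e.g. {'dataset': {'desc': None, 'value': 'SOD'}} becomes {'dataset': 'SOD'} (illustrative values).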
+ # print(wandb_conf) + # remove unnecessary fields such as desc and value + for key in wandb_conf: + # if 'desc' in wandb_conf[key]: + # del wandb_conf[key]['desc'] + if isinstance(wandb_conf[key], dict) and 'value' in wandb_conf[key]: + wandb_conf[key] = wandb_conf[key]['value'] + # handle entries that still carry a 'value' wrapper + try: + if 'value' in wandb_conf[key]: + wandb_conf[key] = wandb_conf[key]['value'] + except: + pass + return wandb_conf + +def get_dir_from_wandb_by_code(wandb_dir: Path, code:str) -> Path: + for dir in wandb_dir.iterdir(): + if dir.name.endswith(code): + return dir + print(f'No such code in wandb_dir: {code}') + return None + +def get_best_ckpt_path_and_config(wandb_dir, code): + dir = get_dir_from_wandb_by_code(wandb_dir, code) + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + # print all files in ckpt_dir + vocab_path = next(ckpt_dir.glob('vocab*')) + metadata_path = next(ckpt_dir.glob('*metadata.json')) + + # if there is a pt file ending with 'last', return it + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, metadata_path, vocab_path + +def prepare_model_and_dataset_from_config(config: DictConfig, metadata_path:str, vocab_path:str): + nn_params = config.nn_params + dataset_name = config.dataset + vocab_path = Path(vocab_path) + + if 'Encodec' in dataset_name: + encodec_tokens_path = Path(f"dataset/maestro-v3.0.0-encodec_tokens") + encodec_dataset = EncodecDataset(config, encodec_tokens_path, None, None) + vocab_sizes = encodec_dataset.vocab.get_vocab_size() + train_set, valid_set, test_set = encodec_dataset.split_train_valid_test_set() + + lm_model:model_zoo.LanguageModelTransformer= getattr(model_zoo, nn_params.model_name)(config, vocab_sizes) + else: + # print(config) + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + # Initialize symbolic dataset based on dataset name and configuration parameters + symbolic_dataset = getattr(data_utils, dataset_name)( + vocab=vocab, + encoding_scheme=encoding_scheme, + num_features=num_features, + debug=config.general.debug, + aug_type=config.data_params.aug_type, + input_length=config.train_params.input_length, + first_pred_feature=config.data_params.first_pred_feature, + caption_path=config.captions_path if hasattr(config, 'captions_path') else None, + for_evaluation=True, + ) + + vocab_sizes = symbolic_dataset.vocab.get_vocab_size() + print(f"---{nn_params.main_decoder}--- is used") + print(f"---{dataset_name}--- is used") + print(f"---{encoding_scheme}--- is used") + split_ratio = config.data_params.split_ratio + # test_set = [] + train_set, valid_set, test_set = symbolic_dataset.split_train_valid_test_set(dataset_name=config.dataset, ratio=split_ratio, seed=42, save_dir=None) + + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, 
num_features, config.data_params.first_pred_feature, nn_params) + + # Create the Transformer model based on configuration parameters + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=symbolic_dataset.vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + return AmadeusModel, test_set, symbolic_dataset.vocab + +def add_conti_in_valid(tensor, encoding_scheme): + new_target = tensor.clone() + # Assuming tensor shape is [batch, sequence, features] + # Create a shifted version of the tensor + shifted_tensor = torch.roll(new_target, shifts=1, dims=1) + # The first element of each sequence cannot be a duplicate by definition + shifted_tensor[:, 0, :] = new_target[:, 0, :] + 1 + + # Identify where the original and shifted tensors are the same (duplicates) + duplicates = new_target == shifted_tensor + # TODO: convert hard-coded part + # convert values into False except the 1st and 2nd features + if encoding_scheme == 'nb': + if tensor.shape[2] == 5: + # change beat, instrument + duplicates[:, :, 0] = False + duplicates[:, :, 3] = False + duplicates[:, :, 4] = False + elif tensor.shape[2] == 4: + # change beat + duplicates[:, :, 0] = False + duplicates[:, :, 2] = False + duplicates[:, :, 3] = False + elif tensor.shape[2] == 7: + # change beat, chord, tempo + duplicates[:, :, 0] = False + duplicates[:, :, 4] = False + duplicates[:, :, 5] = False + duplicates[:, :, 6] = False + elif encoding_scheme == 'cp': + if tensor.shape[2] == 5: + # change instrument + duplicates[:, :, 0] = False + duplicates[:, :, 1] = False + duplicates[:, :, 3] = False + duplicates[:, :, 4] = False + elif tensor.shape[2] == 7: + # change chord, tempo + duplicates[:, :, 0] = False + duplicates[:, :, 1] = False + duplicates[:, :, 4] = False + duplicates[:, :, 5] = False + duplicates[:, :, 6] = False + + # Replace duplicates with 9999 + new_target[duplicates] = 9999 + return new_target + +# TODO: hard coded +def add_conti(list_of_lists, encoding_scheme): + if encoding_scheme == 'nb': + if len(list_of_lists[0]) == 4: + # type, beat, pitch, duration + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + elif len(list_of_lists[0]) == 5: + # type, beat, instrument, pitch, duration + previous_instrument = None + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + if list_of_lists[i][2] == previous_instrument and previous_instrument != 0: + list_of_lists[i][2] = 'Conti' + else: + previous_instrument = list_of_lists[i][2] + elif len(list_of_lists[0]) == 7: + # type, beat, chord, tempo, pitch, duration, velocity + previous_chord = None + previous_tempo = None + for i in range(0, len(list_of_lists)): + if list_of_lists[i][0] == 'SSS': + list_of_lists[i][1] = 'Conti' + if list_of_lists[i][2] == previous_chord and previous_chord != 0: + list_of_lists[i][2] = 'Conti' + elif list_of_lists[i][2] != 
previous_chord and list_of_lists[i][2] != 0: + previous_chord = list_of_lists[i][2] + if list_of_lists[i][3] == previous_tempo and previous_tempo != 0: + list_of_lists[i][3] = 'Conti' + elif list_of_lists[i][3] != previous_tempo and list_of_lists[i][3] != 0: + previous_tempo = list_of_lists[i][3] + elif encoding_scheme == 'cp': + if len(list_of_lists[0]) == 7: + # type, beat, chord, tempo, pitch, duration, velocity + previous_chord = None + previous_tempo = None + for i in range(0, len(list_of_lists)): + current_chord = list_of_lists[i][2] + current_tempo = list_of_lists[i][3] + if current_chord == previous_chord and current_chord != 0: + list_of_lists[i][2] = 'Conti' + elif current_chord != previous_chord and current_chord != 0: + previous_chord = current_chord + if current_tempo == previous_tempo and current_tempo != 0: + list_of_lists[i][3] = 'Conti' + elif current_tempo != previous_tempo and current_tempo != 0: + previous_tempo = current_tempo + if len(list_of_lists[0]) == 5: + # type, beat, instrument, pitch, duration + previous_instrument = None + for i in range(0, len(list_of_lists)): + current_instrument = list_of_lists[i][2] + if current_instrument == previous_instrument and current_instrument != 0: + list_of_lists[i][2] = 'Conti' + elif current_instrument != previous_instrument and current_instrument != 0: + previous_instrument = current_instrument + return list_of_lists + +class Evaluator: + def __init__(self, + config: DictConfig, + model:AmadeusModel, + test_set:TuneCompiler, + vocab: Union[LangTokenVocab, LangTokenVocab], + device:str='cuda', + batch_size:int=16): + self.config = config + self.device = device + self.vocab = vocab + + self.model = model + self.model.eval() + self.model.to(device) + self.test_set = test_set + + self.input_len = config.train_params.input_length + self.loss_by_class = {key:[] for key in self.vocab.feature_list} + self.count_by_class = {key:0 for key in self.vocab.feature_list} + self.batch_size = batch_size + + self.is_multiclass = True if config.nn_params.encoding_scheme == 'nb' or config.nn_params.encoding_scheme == 'cp' else False + self.first_pred_feature = self.config.data_params.first_pred_feature + + self.neglect_keywords = ['SSS', 'SSN', 'Conti', 'Metrical', 'Note'] + self.valid_item_prob = [] + + # we don't use focal loss on evaluation + self.focal_alpha = 1 + self.focal_gamma = 0 + + def save_results(self, save_fn): + # convert loss_by_clas tensor to cpu + for key in self.loss_by_class.keys(): + self.loss_by_class[key] = torch.tensor(self.loss_by_class[key]).cpu() + self.count_by_class[key] = torch.tensor(self.count_by_class[key]).cpu() + torch.save({'loss_by_class':self.loss_by_class, 'count_by_class':self.count_by_class}, save_fn) + + @torch.inference_mode() + def get_perplexity(self,less_than=256): + for data in tqdm(self.test_set.data_list, desc='Cal over dataset', position=0): + data_tensor = torch.LongTensor(data[0]) + if self.config.nn_params.encoding_scheme == 'nb': + data_tensor = shift_and_pad(data_tensor, self.first_pred_feature) + data_tensor = data_tensor[:-1] + + x_seg = data_tensor[:-1].unsqueeze(0) + y_seg = data_tensor[1:].unsqueeze(0) + self._cal_initial_seg(x_seg, y_seg) + + if x_seg.shape[1] > self.input_len: + cat_logits = [] + cat_y = [] + cat_mask_indices = [] + batch_x = x_seg[0, 1:].unfold(dimension=0, size=self.input_len, step=1) + batch_y = y_seg[0, 1:].unfold(dimension=0, size=self.input_len, step=1) + if self.is_multiclass: + batch_x = batch_x.transpose(1,2) + batch_y = batch_y.transpose(1,2) + for 
batch_start_idx in tqdm(range(0, min(batch_x.shape[0], less_than), self.batch_size), desc='In piece iter', position=1, leave=False): + x = batch_x[batch_start_idx:batch_start_idx+self.batch_size] + y = batch_y[batch_start_idx:batch_start_idx+self.batch_size] + logits, y,mask_indices = self._cal_following_seg(x, y) + cat_logits.append(logits) + cat_y.append(y) + cat_mask_indices.append(mask_indices) + if self.is_multiclass: + cat_dict = {} + for key in self.vocab.feature_list: + cat_dict[key] = torch.cat([logits_dict[key] for logits_dict in cat_logits], dim=0) + cat_logits = cat_dict + else: + cat_logits = torch.cat(cat_logits, dim=0) + cat_y = torch.cat(cat_y, dim=0) + mask_indices = torch.cat(cat_mask_indices, dim=0) + if self.is_multiclass: + self._update_loss_for_multi_class(cat_logits, cat_y,mask_indices) + else: + cat_prob = torch.nn.functional.softmax(cat_logits, dim=-1) + pt = cat_prob[torch.arange(cat_prob.shape[0]), cat_y] + # focal_loss = -self.focal_alpha * (1-pt)**self.focal_gamma * torch.log(pt) # [batch_size*seq_len] + loss = -torch.log(pt) + self._update_loss_for_single_class(loss, cat_y) + + @torch.inference_mode() + def _update_loss_for_single_class(self, neg_log_prob:torch.Tensor, y:torch.Tensor): + for key in self.vocab.feature_list: + feature_mask = self.vocab.total_mask[key].to(y.device) # [vocab_size,] + mask_for_target = feature_mask[y] # [b*t] + normal_loss_seq_by_class = neg_log_prob[mask_for_target==1] + if mask_for_target.sum().item() != 0: + self.loss_by_class[key] += normal_loss_seq_by_class.tolist() + self.count_by_class[key] += mask_for_target.sum().item() + + @torch.inference_mode() + def _update_loss_for_multi_class(self, logits_dict:dict, tgt:torch.Tensor, mask_indices:torch.Tensor=None): + correct_token_prob = [] + for index, key in enumerate(self.vocab.feature_list): + feat_tgt = tgt[:,index] + logit_values = logits_dict[key] + logit_values = logit_values + prob_values = torch.nn.functional.softmax(logit_values, dim=-1) + # replce the false + correct_token_prob.append(prob_values[torch.arange(prob_values.shape[0]), feat_tgt]) + correct_token_prob = torch.stack(correct_token_prob, dim=1) + # tgt = reverse_shift_and_pad_for_tensor(tgt, self.first_pred_feature) + y_decoded = self.vocab.decode(tgt) + y_decoded = add_conti(y_decoded, self.config.nn_params.encoding_scheme) + # correct_token_prob = reverse_shift_and_pad_for_tensor(correct_token_prob, self.first_pred_feature) + num_notes = logits_dict['pitch'].shape[0] + cum_prob = 1 + max_num = mask_indices.size(0) + for idx in range(max_num): + if max_num != num_notes: + print("not equal",max_num,num_notes) + token = y_decoded[idx] + vaild_mask = mask_indices[idx,:] + token_prob = correct_token_prob[idx].tolist() + for j, key in enumerate(self.vocab.feature_list): + cur_feature = token[j] + whether_predicted = vaild_mask[j] + # clamp cur_prob to avoid when cur_prob is 0 + cur_prob = max(token_prob[j], 1e-10) + if cur_feature == 0: # ignore token + continue + if whether_predicted is False: # skip provided token + continue + if cur_feature in self.neglect_keywords: + cum_prob *= cur_prob + continue + if self.config.nn_params.encoding_scheme == 'cp' and 'time_signature' in cur_feature: + cum_prob *= cur_prob + continue + if self.config.nn_params.encoding_scheme == 'cp' and 'Bar' in cur_feature: + cum_prob = 1 + continue + self.valid_item_prob.append([cur_feature, cur_prob, cur_prob*cum_prob]) + pt = cur_prob*cum_prob + loss = -log(pt) + self.loss_by_class[key].append(loss) + self.count_by_class[key] += 1 + 
cum_prob = 1 + + @torch.inference_mode() + def _cal_initial_seg(self, x_seg, y_seg): + x, y = x_seg[:, :self.input_len].to(self.device), y_seg[:, :self.input_len].to(self.device) + mask_indices = torch.ones_like(y).bool().to(self.device).flatten(0,1) + if self.config.use_diff is True: + logits,(mask_indices,_) = self.model(x, y) + else: + logits = self.model(x, y) + y = y.flatten(0,1) + if self.is_multiclass: + for key in logits.keys(): + feat_tensor = logits[key].flatten(0,1) + logits[key] = feat_tensor + self._update_loss_for_multi_class(logits, y, mask_indices) + else: + prob = torch.nn.functional.softmax(logits, dim=-1) + prob = prob.flatten(0,1) + pt = prob[torch.arange(len(y)), y] + loss = -torch.log(pt) + self._update_loss_for_single_class(loss, y) + + @torch.inference_mode() + def _cal_following_seg(self, x:torch.Tensor, y:torch.Tensor): + x, y = x.to(self.device), y.to(self.device) + mask_indices = torch.ones_like(y).bool().to(self.device) + if self.config.use_diff is True: + logits,(mask_indices,_) = self.model(x, y) + else: + logits = self.model(x, y) + y = y[:, -1:].flatten(0,1).cpu() + mask_indices = mask_indices.reshape(x.shape)[:,-1:].flatten(0,1).cpu() + if self.is_multiclass: + logits_dict = {} + for key in self.vocab.feature_list: + logits_dict[key] = logits[key][:, -1:].flatten(0,1).cpu() + return logits_dict, y,mask_indices + else: + logits = logits[:, -1:].flatten(0,1).cpu() + return logits, y,mask_indices + + def prepare_prompt_and_ground_truth(self, save_dir, num_target_samples, num_target_measures): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + for i, (tuneidx, tune_name) in enumerate(self.test_set): + ground_truth_sample = tuneidx + try: + decoder(ground_truth_sample, output_path=str(save_dir / f"{i}_{tune_name}_gt.mid")) + except: + print(f"Error in generating {i}_{tune_name}.mid") + + prompt = self.model.decoder._prepare_inference(start_token=self.model.decoder.net.start_token, manual_seed=0, condition=tuneidx, num_target_measures=num_target_measures) + try: + decoder(prompt, output_path=str(save_dir / f"{i}_{tune_name}_prompt.mid")) + except: + print(f"Error in generating {i}_{tune_name}_prompt.mid") + + if i == num_target_samples: + break + + def generate_samples_with_prompt(self, save_dir, num_target_measures, tuneidx, tune_name, first_pred_feature, sampling_method=None, threshold=None, temperature=1.0,generation_length=3072): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + 
tuneidx = tuneidx.cuda() + generated_sample = self.model.generate(0, generation_length, condition=tuneidx, num_target_measures=num_target_measures, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + decoder(generated_sample, output_path=str(save_dir / f"{tune_name}.mid")) + + prompt = self.model.decoder._prepare_inference(self.model.decoder.net.start_token, 0, tuneidx, num_target_measures=8) + decoder(prompt, output_path=str(save_dir / f"{tune_name}_prompt.mid")) + + def generate_samples_unconditioned(self, save_dir, num_samples, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = self.config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + for i in range(num_samples): + generated_sample = self.model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + decoder(generated_sample, output_path=str(save_dir / f"{uid}_{i}.mid")) + + def generate_samples_with_text_prompt(self, save_dir, prompt, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = self.config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base') + encoder = T5EncoderModel.from_pretrained('google/flan-t5-base').to(self.device) + print(f"Using T5EncoderModel for text prompt: {prompt}") + context = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(self.device) + context = encoder(**context).last_hidden_state + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[self.config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=self.vocab, in_beat_resolution=in_beat_resolution, dataset_name=self.config.dataset) + + generated_sample = self.model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + # Open the jsonl file and count the number of lines to determine the current index + jsonl_path = save_dir / "name2prompt.jsonl" + if jsonl_path.exists(): + with open(jsonl_path, 'r') as f: + current_idx = sum(1 for _ in f) + else: + current_idx = 0 + + name = f"prompt_{current_idx}" + name2prompt_dict = 
defaultdict(list) + name2prompt_dict[name].append(prompt) + with open(jsonl_path, 'a') as f: + f.write(json.dumps(name2prompt_dict) + '\n') + decoder(generated_sample, output_path=str(save_dir / f"{name}_{uid}.mid")) diff --git a/Amadeus/model_zoo.py b/Amadeus/model_zoo.py new file mode 100644 index 0000000..492c8c9 --- /dev/null +++ b/Amadeus/model_zoo.py @@ -0,0 +1,512 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm.auto import tqdm +import time +import json + +from . import transformer_utils +from . import sub_decoder_zoo +from x_transformers.x_transformers import LayerIntermediates, AbsolutePositionalEmbedding +from data_representation.vocab_utils import LangTokenVocab +import os + +class AmadeusModelWrapper(nn.Module): + def __init__( + self, + *, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + ''' + This class wraps the three main components of the AmadeusModel model, + which are the input embedding layer, the main transformer decoder, and the sub-decoder. + ''' + + super().__init__() + self.vocab = vocab + self.vocab_size = vocab.get_vocab_size() + self.start_token = vocab.sos_token if hasattr(vocab, 'sos_token') else None + self.end_token = vocab.eos_token if hasattr(vocab, 'eos_token') else None + self.input_length = input_length + self.prediction_order = prediction_order + self._get_input_embedder(input_embedder_name, vocab, dropout, dim) + self._get_main_decoder(main_decoder_name, input_length, dim, heads, depth, dropout) + self._get_sub_decoder(sub_decoder_name, prediction_order, vocab, sub_decoder_depth, sub_decoder_enricher_use, dim, heads, dropout) + self.bos_token_hidden = None + + def _get_input_embedder(self, input_embedder_name, vocab, dropout, dim): + self.emb_dropout = nn.Dropout(dropout) + self.input_embedder = getattr(transformer_utils, input_embedder_name)( + vocab=vocab, + dim_model=dim + ) + + def _get_main_decoder(self, main_decoder_name, input_length, dim, heads, depth, dropout): + self.pos_enc = AbsolutePositionalEmbedding(dim, input_length) + self.main_norm = nn.LayerNorm(dim) + self.main_decoder = getattr(transformer_utils, main_decoder_name)( + dim=dim, + depth=depth, + heads=heads, + dropout=dropout + ) + + def _get_sub_decoder(self, sub_decoder_name, prediction_order, vocab, sub_decoder_depth, sub_decoder_enricher_use, dim, heads, dropout): + self.sub_decoder = getattr(sub_decoder_zoo, sub_decoder_name)( + prediction_order=prediction_order, + vocab=vocab, + dim=dim, + sub_decoder_depth=sub_decoder_depth, + heads=heads, + dropout=dropout, + sub_decoder_enricher_use=sub_decoder_enricher_use + ) + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor, context=None): + embedding = self.input_embedder(input_seq) + self.pos_enc(input_seq) + embedding = self.emb_dropout(embedding) + hidden_vec,layer_inter = self.main_decoder(embedding,train=True, context=context) # B x T x d_model + hidden_vec = self.main_norm(hidden_vec) + input_dict = {'hidden_vec':hidden_vec, 'input_seq': input_seq, 'target': target, 'bos_token_hidden': self.bos_token_hidden} + logits = self.sub_decoder(input_dict) + # pick the intermediate layer closest to one third of the total depth + num_layers = len(layer_inter.layer_hiddens) + idx = round(num_layers / 3) + idx = min(max(idx, 0), num_layers - 1) + 
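# e.g. with a 12-layer main decoder, round(12 / 3) = 4, so the hidden state roughly one third of the way up the stack is what gets placed back into input_dict below + 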
input_dict['hidden_vec'] = layer_inter.layer_hiddens[idx] + return logits, input_dict + +class AmadeusModelAutoregressiveWrapper(nn.Module): + def __init__(self, net:AmadeusModelWrapper): + ''' + Initializes an autoregressive wrapper around the AmadeusModelWrapper, + which allows sequential token generation. + + Arguments: + - net: The nested music transformer model that performs the token generation. + ''' + super().__init__() + self.net = net + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor,context=None): + return self.net(input_seq, target, context=context) + + def _prepare_inference(self, start_token, manual_seed, condition=None, num_target_measures=4): + ''' + Prepares the initial tokens for autoregressive inference. If a manual seed is provided, + it sets the seed for reproducibility. If a condition is given, it selects a subset of + the tokens based on certain criteria related to the encoding scheme. + + Arguments: + - start_token: The token that represents the start of a sequence. + - manual_seed: A seed value for reproducibility in inference (if greater than 0). + - condition: An optional tensor used for conditional generation, which helps select a + portion of the input tokens based on the encoding scheme. + + Returns: + - total_out: A tensor containing the initial tokens for inference, padded to ensure compatibility + with the model. + ''' + if manual_seed > 0: + torch.manual_seed(manual_seed) + + total_out = [] + if condition is None: + # Use the start token if no condition is given + total_out.extend(start_token) + else: + # Extract the portion of the sequence depending on encoding scheme (remi, cp, or nb) + if self.net.vocab.encoding_scheme == 'remi': + type_boundaries = self.net.vocab.remi_vocab_boundaries_by_key['type'] + # vocab idx -> 0:SOS, 1:EOS, 2:Bar_without_time_signature, ... where_type_ends:Bar_time_signature_end, ... + measure_bool = (2 <= condition) & (condition < type_boundaries[1]) # between Bar_ts_start and Bar_ts_end + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + elif self.net.vocab.encoding_scheme == 'cp': + # find the start and end of the measure + beat_event2idx = self.net.vocab.event2idx['beat'] + for event, idx in beat_event2idx.items(): + if event == 0: + continue + if event == 'Bar': + start_idx = idx + elif event.startswith('Beat'): + end_idx = idx + break + measure_bool = (condition[:,1] >= start_idx) & (condition[:,1] < end_idx) # measure tokens + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + # measure_bool = (condition[:,1] == 1) # measure tokens + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + elif self.net.vocab.encoding_scheme == 'nb': + measure_bool = (condition[:,0] == 2) | (condition[:,0] >= 5) # Empty measure or where new measure starts + conditional_input_len = torch.where(measure_bool)[0][num_target_measures].item() + + if conditional_input_len == 0: + conditional_input_len = 50 + + selected_tokens = condition[:conditional_input_len].tolist() + total_out.extend(selected_tokens) + + total_out = torch.LongTensor(total_out).unsqueeze(0).to(self.net.device) + return total_out + + def _run_one_step(self, input_seq, cache=None, sampling_method=None, threshold=None, temperature=1, bos_hidden_vec=None,context=None): + ''' + Runs one step of autoregressive generation by taking the input sequence, embedding it, + passing it through the main decoder, and generating logits and a sampled token. 
+ + Arguments: + - input_seq: The input sequence tensor to be embedded and processed. + - cache: Optional cache for attention mechanisms to avoid recomputation. + - sampling_method: Sampling strategy used to select the next token. + - threshold: Optional threshold value for sampling methods that require it. + - temperature: Controls the randomness of predictions (higher temperature increases randomness). + + Returns: + - logits: The predicted logits for the next token. + - sampled_token: The token sampled from the logits. + - intermidiates: Intermediate states from the main decoder, useful for caching. + ''' + embedding = self.net.input_embedder(input_seq) + self.net.pos_enc(input_seq) + embedding = self.net.emb_dropout(embedding) + + # Run through the main decoder and normalize + hidden_vec, intermidiates = self.net.main_decoder(embedding, cache,context_embedding=context) # B x T x d_model + hidden_vec = self.net.main_norm(hidden_vec) + hidden_vec = hidden_vec[:, -1:] # Keep only the last time step + + input_dict = {'hidden_vec': hidden_vec, 'input_seq': input_seq, 'target': None, 'bos_token_hidden': bos_hidden_vec} + + # Generate the next token + logits, sampled_token = self.net.sub_decoder(input_dict, sampling_method, threshold, temperature) + return logits, sampled_token, intermidiates, hidden_vec + + def _update_total_out(self, total_out, sampled_token): + ''' + Updates the output sequence with the newly sampled token. Depending on the encoding scheme, + it either appends the token directly or processes feature-based sampling. + + Arguments: + - total_out: The tensor containing the previously generated tokens. + - sampled_token: The newly generated token to be appended. + + Returns: + - total_out: Updated output tensor with the newly generated token. + - sampled_token: The processed sampled token. + ''' + if self.net.vocab.encoding_scheme == 'remi': + # For remi encoding, directly append the sampled token + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=-1) + else: + # Handle other encoding schemes by concatenating features + sampled_token_list = [] + for key in self.net.vocab.feature_list: + sampled_token_list.append(sampled_token[key]) + sampled_token = torch.cat(sampled_token_list, dim=-1) + # print(total_out.shape) + if len(sampled_token.shape) == 2: + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=1) + total_out = torch.cat([total_out, sampled_token.unsqueeze(0).unsqueeze(0)], dim=1) + + return total_out, sampled_token + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1, batch_size=1, context=None): + ''' + Autoregressively generates a sequence of tokens by repeatedly sampling the next token + until the desired maximum sequence length is reached or the end token is encountered. + + Arguments: + - manual_seed: A seed value for reproducibility in inference. + - max_seq_len: The maximum length of the generated sequence. + - condition: An optional conditioning sequence to start generation from. + - sampling_method: The method used to sample the next token (e.g., greedy, top-k). + - threshold: Optional threshold for sampling (used in methods like top-p sampling). + - temperature: Controls the randomness of the token sampling process. + - batch_size: The number of sequences to generate in parallel. + + Returns: + - total_out: The generated sequence of tokens as a tensor. 
+ ''' + # Prepare the starting sequence for inference + total_out = self._prepare_inference(self.net.start_token, manual_seed, condition, num_target_measures) + + # If a condition is provided, run one initial step + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context) + else: + cache = LayerIntermediates() + + # Continue generating tokens until the maximum sequence length is reached + pbar = tqdm(total=max_seq_len, desc="Generating tokens", unit="token") + bos_hidden_vec = None + hidden_vec_list = [] + token_time_list = [] + while total_out.shape[1] < max_seq_len: + pbar.update(1) + input_tensor = total_out[:, -self.net.input_length:] + # Generate the next token and update the cache + time_start = time.time() + _, sampled_token, cache, hidden_vec = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature,bos_hidden_vec=bos_hidden_vec, context=context) + time_end = time.time() + token_time_list.append(time_end - time_start) + if bos_hidden_vec is None: + bos_hidden_vec = hidden_vec + hidden_vec_list.append(hidden_vec) + # Update attention cache to handle autoregressive generation + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.net.input_length - 1):, :] for t in inter.cached_kv] + + # Update the generated output with the new token + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + + # Stop if the end token is reached + if sampled_token.tolist() == self.net.end_token[0]: + break + # append hidden_vec to pkl + + # save_path = 'hidden/diffnoaug_hidden_vec.pt' + # save_time_path = 'hidden/diff_noaug_token_time.json' + # if os.path.exists(save_path): + # # Load existing list and append + # hidden_vec_all = torch.load(save_path, map_location="cpu") + # hidden_vec_all.extend(hidden_vec_list) + # torch.save(hidden_vec_all, save_path) + # else: + # torch.save(hidden_vec_list, save_path) + + # if os.path.exists(save_time_path): + # # Load existing list and append + # token_time_all = json.load(open(save_time_path, 'r')) + # token_time_all = token_time_all['token_time_list'] + # token_time_all.extend(token_time_list) + # average_time = sum(token_time_all) / len(token_time_all) + # data = { + # 'average_time': average_time, + # 'token_time_list': token_time_all + # } + # json.dump(data, open(save_time_path, 'w'), indent=4) + # else: + # average_time = sum(token_time_list) / len(token_time_list) + # data = { + # 'average_time': average_time, + # 'token_time_list': token_time_list + # } + # json.dump(data, open(save_time_path, 'w'), indent=4) + + return total_out + + def generate_batch(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1, batch_size=1): + ''' + Autoregressively generates a sequence of tokens by repeatedly sampling the next token + until the desired maximum sequence length is reached or the end token is encountered. + + Arguments: + - manual_seed: A seed value for reproducibility in inference. + - max_seq_len: The maximum length of the generated sequence. + - condition: An optional conditioning sequence to start generation from. + - sampling_method: The method used to sample the next token (e.g., greedy, top-k). + - threshold: Optional threshold for sampling (used in methods like top-p sampling). 
+ - temperature: Controls the randomness of the token sampling process. + - batch_size: The number of sequences to generate in parallel. + + Returns: + - total_out: The generated sequence of tokens as a tensor. + ''' + # Prepare the starting sequence for inference + total_out = self._prepare_inference(self.net.start_token, manual_seed, condition, num_target_measures) + # total_out (1,1,num) -> (bs,1,num) + total_out = total_out.repeat(batch_size, 1, 1) + # If a condition is provided, run one initial step + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature) + else: + cache = LayerIntermediates() + + # Continue generating tokens until the maximum sequence length is reached + pbar = tqdm(total=max_seq_len, desc="Generating tokens", unit="token") + while total_out.shape[1] < max_seq_len: + pbar.update(1) + input_tensor = total_out[:, -self.net.input_length:] + + # Generate the next token and update the cache + _, sampled_token, cache = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + + # Update attention cache to handle autoregressive generation + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.net.input_length - 1):, :] for t in inter.cached_kv] + + # Update the generated output with the new token + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + + # Stop if the end token is reached + if sampled_token.tolist() == self.net.end_token[0]: + break + + return total_out + +class AmadeusModel(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + ''' + This class combines the wrapper classes and initializes the full AmadeusModel model, + which can perform autoregressive sequence generation for symbolic music. + + Vocabulary used for tokenization of the symbolic music data. + Length of the input seqkeuence in tokens. + Defines the order in which features are predicted in a sequence used for compound shift + Name of the input embedding model to be used (e.g., one-hot embedding or learned embeddings). + Name of the main transformer decoder model used for generating the hidden representations for compound tokens. + Name of the sub-decoder, which processes the hidden states and decodes the sub-tokens inside the compound tokens. + Depth (number of layers) of the sub-decoder. + Whether to use an additional enricher module in the sub-decoder to refine representations. + Dimensionality of the model (hidden size of the transformer layers). + Number of attention heads in the transformer layers. + Number of layers in the main decoder. + Dropout rate for all layers in the model. 
+ ''' + + super().__init__() + decoder = AmadeusModelWrapper( + vocab=vocab, + input_length=input_length, + prediction_order=prediction_order, + input_embedder_name=input_embedder_name, + main_decoder_name=main_decoder_name, + sub_decoder_name=sub_decoder_name, + sub_decoder_depth=sub_decoder_depth, + sub_decoder_enricher_use=sub_decoder_enricher_use, + dim=dim, + heads=heads, + depth=depth, + dropout=dropout + ) + self.decoder = AmadeusModelAutoregressiveWrapper( + net=decoder + ) + + def forward(self, input_seq:torch.Tensor, target:torch.Tensor, context=None): + return self.decoder(input_seq, target, context=context) + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, num_target_measures=4, sampling_method=None, threshold=None, temperature=1,batch_size=1,context=None): + if batch_size == 1: + return self.decoder.generate(manual_seed, max_seq_len, condition, num_target_measures, sampling_method, threshold, temperature, context=context) + else: + return self.decoder.generate_batch(manual_seed, max_seq_len, condition, num_target_measures, sampling_method, threshold, temperature, batch_size, context=context) + +class AmadeusModel4Encodec(AmadeusModel): + def __init__( + self, + vocab:LangTokenVocab, + input_length:int, + prediction_order:list, + input_embedder_name:str, + main_decoder_name:str, + sub_decoder_name:str, + sub_decoder_depth:int, + sub_decoder_enricher_use:bool, + dim:int, + heads:int, + depth:int, + dropout:float + ): + super().__init__( + vocab=vocab, + input_length=input_length, + prediction_order=prediction_order, + input_embedder_name=input_embedder_name, + main_decoder_name=main_decoder_name, + sub_decoder_name=sub_decoder_name, + sub_decoder_depth=sub_decoder_depth, + sub_decoder_enricher_use=sub_decoder_enricher_use, + dim=dim, + heads=heads, + depth=depth, + dropout=dropout + ) + + def _prepare_inference(self, start_token, manual_seed, condition=None): + if manual_seed > 0: + torch.manual_seed(manual_seed) + total_out = [] + if condition is None: + total_out.extend(start_token) + else: + if self.decoder.net.vocab.encoding_scheme == 'remi': + selected_tokens = condition[:1500].tolist() + else: + selected_tokens = condition[:500].tolist() + total_out.extend(selected_tokens) + total_out = torch.LongTensor(total_out).unsqueeze(0).to(self.decoder.net.device) + return total_out + + def _update_total_out(self, total_out, sampled_token): + if self.decoder.net.vocab.encoding_scheme == 'remi': + total_out = torch.cat([total_out, sampled_token.unsqueeze(0)], dim=-1) + else: + sampled_token_list = [] + for key in self.decoder.net.vocab.feature_list: + sampled_token_list.append(sampled_token[key]) + sampled_token = torch.cat(sampled_token_list, dim=-1) # B(1) x num_features + total_out = torch.cat([total_out, sampled_token.unsqueeze(0).unsqueeze(0)], dim=1) + return total_out, sampled_token + + def _run_one_step(self, input_seq, cache=None, sampling_method=None, threshold=None, temperature=1): + embedding = self.decoder.net.input_embedder(input_seq) + self.decoder.net.pos_enc(input_seq) + embedding = self.decoder.net.emb_dropout(embedding) + hidden_vec, intermidiates = self.decoder.net.main_decoder(embedding, cache) # B x T x d_model + hidden_vec = self.decoder.net.main_norm(hidden_vec) + hidden_vec = hidden_vec[:, -1:] # B x 1 x d_model + input_dict = {'hidden_vec':hidden_vec, 'input_seq': input_seq, 'target': None} + if self.decoder.net.vocab.encoding_scheme == 'remi': + feature_class_idx = (input_seq.shape[1] - 1) % 4 + feature_type = 
self.decoder.net.vocab.feature_list[feature_class_idx] + logits, sampled_token = self.decoder.net.sub_decoder.run_one_step(input_dict, sampling_method, threshold, temperature, feature_type) + else: + logits, sampled_token = self.decoder.net.sub_decoder(input_dict, sampling_method, threshold, temperature) + return logits, sampled_token, intermidiates + + @torch.inference_mode() + def generate(self, manual_seed, max_seq_len, condition=None, sampling_method=None, threshold=None, temperature=1): + total_out = self._prepare_inference(self.decoder.net.start_token, manual_seed, condition) + if condition is not None: + _, _, cache = self._run_one_step(total_out[:, -self.decoder.net.input_length:], cache=LayerIntermediates(), sampling_method=sampling_method, threshold=threshold, temperature=temperature) + else: + cache = LayerIntermediates() + while total_out.shape[1] < max_seq_len: + input_tensor = total_out[:, -self.decoder.net.input_length:] + _, sampled_token, cache = self._run_one_step(input_tensor, cache=cache, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + for inter in cache.attn_intermediates: + inter.cached_kv = [t[..., -(self.decoder.net.input_length - 1):, :] for t in inter.cached_kv] # B x num_heads x T x d_head + total_out, sampled_token = self._update_total_out(total_out, sampled_token) + if sampled_token.tolist() == self.decoder.net.end_token[0]: + break + return total_out \ No newline at end of file diff --git a/Amadeus/sampling_utils.py b/Amadeus/sampling_utils.py new file mode 100644 index 0000000..28f652b --- /dev/null +++ b/Amadeus/sampling_utils.py @@ -0,0 +1,168 @@ +import torch +import torch.nn.functional as F + +def top_p_sampling(logits, thres=0.9): + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + sorted_indices_to_remove = cum_probs > thres + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # Create an empty tensor to hold the new logits + new_logits = logits.clone() + + # Use the sorted indices to place the '-inf' in the original places + indices_to_remove = sorted_indices[sorted_indices_to_remove] + new_logits[..., indices_to_remove] = float('-inf') + return new_logits + + +# refered: https://github.com/cimeister/typical-sampling +def typical_sampling(logits, thres=0.99): + # calculate entropy + normalized = torch.nn.functional.log_softmax(logits, dim=-1) + p = torch.exp(normalized) + ent = -(normalized * p).nansum(-1, keepdim=True) + + # shift and sort + shifted_scores = torch.abs((-normalized) - ent) + sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False) + sorted_logits = logits.gather(-1, sorted_indices) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative mass above the threshold + last_ind = (cumulative_probs < thres).sum(dim=-1) + last_ind[last_ind < 0] = 0 + sorted_indices_to_remove = sorted_scores > sorted_scores.gather(-1, last_ind.view(-1, 1, 1)) + # if self.min_tokens_to_keep > 1: + # # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + # sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0 + indices_to_remove = sorted_indices_to_remove.scatter(2, sorted_indices, sorted_indices_to_remove) + + scores = logits.masked_fill(indices_to_remove, float("-inf")) + return scores + +def add_gumbel_noise(logits, temperature): + ''' + The Gumbel 
max is a method for sampling categorical distributions. + According to arXiv:2409.02908, for MDM, low-precision Gumbel Max improves perplexity score but reduces generation quality. + Thus, we use float64. + ''' + if temperature == 0: + return logits + logits = logits.to(torch.float64) + noise = torch.rand_like(logits, dtype=torch.float64) + gumbel_noise = (- torch.log(noise)) ** temperature + return logits.exp() / gumbel_noise + # +# referred: https://github.com/john-hewitt/truncation-sampling +def eta_sampling(logits, epsilon) -> torch.FloatTensor: + probabilities = logits.softmax(dim=-1) + entropy = torch.distributions.Categorical(probs=probabilities).entropy() + new_epsilon = min(epsilon, torch.sqrt(torch.tensor(epsilon))*torch.exp(-entropy)) + indices_to_remove = probabilities < new_epsilon + max_word = torch.argmax(logits, dim=-1) + indices_to_remove[..., max_word.squeeze()] = 0 + new_scores = logits.masked_fill(indices_to_remove, float("-inf")) + return new_scores + +def sample(logits, sampling_method, threshold, temperature): + """Sample from the logits with a specific sampling strategy.""" + if sampling_method == "top_p": + probs = F.softmax(top_p_sampling(logits, thres=threshold) / temperature, dim=-1) + elif sampling_method == "typical": + probs = F.softmax(typical_sampling(logits, thres=threshold) / temperature, dim=-1) + elif sampling_method == "eta": + probs = F.softmax(eta_sampling(logits, epsilon=threshold) / temperature, dim=-1) + else: + probs = F.softmax(logits / temperature, dim=-1) + return torch.multinomial(probs[-1,-1,:], 1) + +def sample_with_prob(logits, sampling_method, threshold, temperature): + """Sample from the logits with a specific sampling strategy and return the token and its probability.""" + # temporarily apply the sampling method to logits + logits = logits / temperature + # logits = add_gumbel_noise(logits, temperature) + + if sampling_method == "top_p": + modified_logits = top_p_sampling(logits, thres=threshold) + elif sampling_method == "typical": + modified_logits = typical_sampling(logits, thres=threshold) + elif sampling_method == "eta": + modified_logits = eta_sampling(logits, epsilon=threshold) + else: + modified_logits = logits # otherwise use the raw logits + + # print(modified_logits.shape) + # apply the temperature adjustment and compute probabilities + # probs = F.softmax(modified_logits / temperature, dim=-1) + probs = F.softmax(modified_logits, dim=-1) + + # take the probability distribution at the last position + # probs_last = probs[-1, -1, :] + # print(probs.shape) + probs_last = probs[-1, -1, :] + + # sample a token + sampled_token = torch.multinomial(probs_last, num_samples=1) + # look up the probability of the sampled token + prob_value = probs_last[sampled_token] + + return sampled_token, prob_value.squeeze()
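For reference, a minimal usage sketch of the samplers above (a hypothetical snippet: the logits shape and values are illustrative, and it assumes the package is importable as Amadeus):

import torch
from Amadeus.sampling_utils import sample_with_prob

logits = torch.randn(1, 1, 128)  # B x T x vocab_size, illustrative
token, prob = sample_with_prob(logits, sampling_method="top_p", threshold=0.9, temperature=1.0)
print(token.item(), prob.item())  # sampled vocab index and its probability

sample_with_prob_fast below applies the same idea to a whole [B*T, num_sub_tokens, vocab_size] batch by flattening before torch.multinomial.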
+ +def top_p_sampling_fast(logits, thres=0.9): + """ + logits: Tensor of shape [B, L, V] + Returns: logits with low-prob tokens masked as -inf, shape [B, L, V] + """ + # Step 1: sort logits and get indices + sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True) # [B, L, V] + + # Step 2: compute cumulative probs + probs = F.softmax(sorted_logits, dim=-1) # [B, L, V] + cum_probs = torch.cumsum(probs, dim=-1) # [B, L, V] + + # Step 3: mask tokens beyond cumulative threshold + sorted_mask = cum_probs > thres + sorted_mask[..., 1:] = sorted_mask[..., :-1].clone() + sorted_mask[..., 0] = False # always keep at least one token + + # Step 4: scatter back to original order + # Create mask of same shape as logits, default False + mask = torch.zeros_like(logits, dtype=torch.bool) # [B, L, V] + mask = mask.scatter(-1, sorted_indices, sorted_mask) + + # Step 5: mask logits + logits = logits.masked_fill(mask, float('-inf')) # final masked logits + + return logits + +def sample_with_prob_fast(logits, sampling_method="top_p", threshold=0.9, temperature=1.0, mask_indices=None): + """ + logits: [B*T, num_sub_tokens, vocab_size] + mask_indices: mask indicating which tokens to sample, shape = [B*T, num_sub_tokens] + """ + if temperature != 1.0: + logits = logits / temperature + + if sampling_method == "top_p": + logits = top_p_sampling_fast(logits, thres=threshold) # should support batch + elif sampling_method == "typical": + logits = typical_sampling(logits, thres=threshold) + elif sampling_method == "eta": + logits = eta_sampling(logits, epsilon=threshold) + # else: keep logits as-is + + probs = torch.softmax(logits, dim=-1) # [B*T, num_sub_tokens, vocab_size] + + B, L, V = probs.shape + probs_flat = probs.view(-1, V) # [(B*T * num_sub_tokens), V] + + # sampling: torch.multinomial cannot take 3-D input at once, so flatten, sample, then reshape + sampled = torch.multinomial(probs_flat, num_samples=1) # [(B*T * num_sub_tokens), 1] + sampled = sampled.view(B, L) # [B*T, num_sub_tokens] + + sampled_probs = torch.gather(probs, 2, sampled.unsqueeze(-1)).squeeze(-1) # [B*T, num_sub_tokens] + + return sampled, sampled_probs diff --git a/Amadeus/sub_decoder_utils.py b/Amadeus/sub_decoder_utils.py new file mode 100644 index 0000000..3109ef4 --- /dev/null +++ b/Amadeus/sub_decoder_utils.py @@ -0,0 +1,228 @@ +from math import ceil + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class MLP(nn.Module): + def __init__(self, in_size, out_size, hidden_size, dropout): + super().__init__() + self.out_size = out_size + self.layer = nn.Sequential( + nn.Linear(in_size, hidden_size), + nn.Dropout(dropout), + nn.ReLU(), + nn.Linear(hidden_size, out_size) + ) + + def forward(self, x): + return self.layer(x) + +class extendedMLP(nn.Module): + def __init__(self, in_size, out_size, num_layers, hidden_size, dropout): + super().__init__() + self.input_size = in_size + + self.layers = nn.ModuleList() + if num_layers == 1: + # Only one layer + self.layers.append(nn.Linear(in_size, out_size)) + return + elif num_layers > 1: + # First layer + self.layers.append(nn.Linear(in_size, hidden_size)) + self.layers.append(nn.Dropout(dropout)) + self.layers.append(nn.ReLU()) + # Intermediate layers + if num_layers > 2: + for _ in range(num_layers - 2): # -2 because we're manually adding the first and last layers + self.layers.append(nn.Linear(hidden_size, hidden_size)) + self.layers.append(nn.Dropout(dropout)) + self.layers.append(nn.ReLU()) + # Last layer + self.layers.append(nn.Linear(hidden_size, out_size)) + else: + raise ValueError("num_layers should be a positive integer") + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + +class multiMLP(nn.Module): + def __init__(self, in_size, out_size, hidden_size, dropout, pred_order): + super().__init__() + self.out_size = out_size + self.pred_order = pred_order # keep the order so forward() can look up each feature's head + self.layer = nn.ModuleList([MLP(in_size, out_size, hidden_size, dropout) for _ in pred_order]) + + def forward(self, x, choice): + ''' + x: B x T x d_model + choice: token type from self.pred_order (str or list of str) + ''' + if isinstance(choice, str): + idx = self.pred_order.index(choice) + return self.layer[idx](x) + elif len(choice) > 1 and not isinstance(choice, str): + raise ValueError("multiMLP doesn't support parallel prediction") + +class ResidualLayerNormModule(nn.Module): + def __init__(self, submodule: nn.Module): + super().__init__() + self.submodule = submodule + if submodule.__class__.__name__ == 
'MultiheadAttention': + self.layer_norm = nn.LayerNorm(self.submodule.embed_dim) + else: + self.layer_norm = nn.LayerNorm(self.submodule.input_size) + + def forward_attention(self, q, k, v, attn_mask, type): + attn_output, _ = self.submodule(q, k, v, attn_mask=attn_mask, need_weights=False, average_attn_weights=False) + return self.layer_norm(attn_output + q) + + def forward_mlp(self, x): + return self.layer_norm(self.submodule(x) + x) + +class MultiProj_hidden2logit(nn.Module): + def __init__(self, dim, vocab_sizes): + super().__init__() + self.layers = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + + def forward(self, hidden_vec, feature): + logit = self.layers[f"layer_{feature}"](hidden_vec) + return logit + +class MultiProj_catvec2hidden(nn.Module): + def __init__(self, config, par_pred_keys, seq_pred_keys): + super().__init__() + ''' + This class is used in SQstyleEachEmbStrategy + par_pred_keys: list of independent features(These tokens are predicted in parallel) + seq_pred_keys: list of sequential features(These tokens are predicted sequentially) + ''' + net_param = config.nn_params + self.d_model = net_param.model.d_model + independent_emb_size = 0 + for key in par_pred_keys: + independent_emb_size += net_param.emb[key] + self.layers = nn.ModuleDict({ + 'layer_independent': nn.Linear(self.d_model + independent_emb_size, self.d_model), + **{f"layer_{key}": nn.Linear(self.d_model + net_param.emb[key], self.d_model) for key in seq_pred_keys} + }) + self.par_pred_keys = par_pred_keys + self.seq_pred_keys = seq_pred_keys + self.dropout = nn.Dropout(0.1) + self.relu = nn.ReLU() + + def forward(self, x, choice): + ''' + x: B x T x (d_model + emb_size) + choice: key type (str or list of str) + ''' + if isinstance(choice, str): # single key + assert choice in self.seq_pred_keys + output = self.layers[f"layer_{choice}"](x) + return self.relu(self.dropout(output)) + elif len(choice) > 1 and not isinstance(choice, str): # multiple keys, parallel + assert choice == self.par_pred_keys # the order of choice should be the same as the order of self.par_pred_keys + output = self.layers['layer_independent'](x) + return self.relu(self.dropout(output)) + +def mask_tensor(tensor, mask_rate=0.15): + # Get the size of the tensor + batch_size, seq_len, dim = tensor.size() + # Calculate the total number of elements and the number to mask + total_elements = batch_size * seq_len + num_to_mask = int(total_elements * mask_rate) + # Create a 1D binary mask where 1 indicates that element will be masked. + # Start by creating a tensor of zeros with length equal to the total number of elements. 
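+ # Worked example (illustrative numbers, not from the original source): with batch_size=2, seq_len=8
+ # and mask_rate=0.15, total_elements = 16 and num_to_mask = int(16 * 0.15) = 2, so two random
+ # (batch, time) positions have their entire d_model vector zeroed out below.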
+ mask = torch.zeros(total_elements).to(tensor.device) + # Set `num_to_mask` random indices to 1 (masking) + indices_to_mask = torch.randperm(total_elements)[:num_to_mask] + mask[indices_to_mask] = 1 + # Reshape the mask to match the original tensor's shape + mask = mask.reshape(batch_size, seq_len) + mask = mask.unsqueeze(2) # B x T x 1 + masked_tensor = tensor * (mask == 0).float() # B x T x d_model + return masked_tensor + +def generate_causality_mask_on_window(size, window_size): + mask = torch.zeros((size, size)) + for i in range(size): + mask[i, i+window_size:] = 1 + return mask.bool() + +# generate boolean mask, if the value is 1 or true, it means the value is masked +# considers BOS token and mask margin +def generate_CA_mask(tgt_len, memory_len, mask_margin=0): + mask = torch.triu(torch.ones((tgt_len, memory_len)), diagonal=mask_margin+1) + return mask.bool() + +# generate boolean mask, if the value is 1 or true, it means the value is masked +def generate_SA_mask(tgt_len): + mask = torch.triu(torch.ones((tgt_len, tgt_len)), diagonal=1) + return mask.bool() + +def generate_none_causality_mask(tgt_len, memory_len): + mask = torch.zeros((tgt_len, memory_len)) + return mask.bool() + +class DecoderLayer(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory, 'memory_mask': CA_attn_mask} + ''' + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], input_dict['memory_mask'], type='cross') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory'], 'memory_mask': input_dict['memory_mask']} + return output_dict + +class TransformerLayer(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.self_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory, 'memory_mask': CA_attn_mask} + ''' + # self attention + attn_output = self.self_attn_block.forward_attention(input_dict['input_seq'], input_dict['input_seq'], input_dict['input_seq'], input_dict['memory_mask'], type='self') + + input_dict['input_seq'] = attn_output + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], input_dict['memory_mask'], type='cross') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory'], 'memory_mask': input_dict['memory_mask']} + return output_dict + +class 
FeatureEnricher(nn.Module): + def __init__(self, dim, num_heads, dropout): + super().__init__() + self.cross_attn_block = ResidualLayerNormModule(nn.MultiheadAttention(embed_dim=dim, num_heads=num_heads, dropout=dropout, batch_first=True)) + self.residual_FF = ResidualLayerNormModule(extendedMLP(in_size=dim, out_size=dim, num_layers=2, hidden_size=2048, dropout=dropout)) + self.dropout = nn.Dropout(dropout) + + def forward(self, input_dict): + ''' + input_dict = {'input_seq': input_seq, 'memory': memory} + ''' + # cross attention + attn_output = self.cross_attn_block.forward_attention(input_dict['input_seq'], input_dict['memory'], input_dict['memory'], None, type='feature_enrichment') + attn_output = self.residual_FF.forward_mlp(attn_output) + attn_output = self.dropout(attn_output) + output_dict = {'input_seq': attn_output, 'memory': input_dict['memory']} + return output_dict \ No newline at end of file diff --git a/Amadeus/sub_decoder_zoo.py b/Amadeus/sub_decoder_zoo.py new file mode 100644 index 0000000..e0994d6 --- /dev/null +++ b/Amadeus/sub_decoder_zoo.py @@ -0,0 +1,1280 @@ +from selectors import EpollSelector +from turtle import st +from numpy import indices +from sympy import Trace, false, true +import torch +import torch.profiler +import torch.nn as nn + +from x_transformers import Decoder + +from .transformer_utils import MultiEmbedding, RVQMultiEmbedding +from .sub_decoder_utils import * +from .sampling_utils import sample, sample_with_prob, sample_with_prob_fast, top_p_sampling, typical_sampling, eta_sampling + +from data_representation.vocab_utils import LangTokenVocab + +class SingleProjection(nn.Module): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + This sub-decoder is used for REMI based models + ''' + super().__init__() + vocab_size = vocab.get_vocab_size() + self.proj = nn.Linear(dim, vocab_size) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=1): + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + # ---- Generate(Inference) ---- # + if target is None: + logits = self.proj(hidden_vec[:, -1:]) + sampled_token = sample(logits, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + return logits, sampled_token + # ---- Training ---- # + logits = self.proj(hidden_vec) + return logits + +class SubDecoderClass(nn.Module): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__() + ''' + This is the base class for all sub-decoders + ''' + self.prediction_order = prediction_order + self.vocab = vocab + self.vocab_size = vocab.get_vocab_size() + # make layers + self._make_emb_layer(vocab, dim) + self._make_projection_layer(vocab, dim) + self._make_nonlinear_layer() + + @property + def device(self): + return next(self.parameters()).device + + def _make_emb_layer(self, vocab, dim): + self.emb_layer = MultiEmbedding( + vocab=vocab, + dim_model=dim + ) + + # def _make_projection_layer(self, vocab, dim): + # vocab_sizes = vocab.get_vocab_size() + # self.hidden2logit = nn.ModuleDict({ + # f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + # }) + + def _make_nonlinear_layer(self): + pass + def _make_projection_layer(self, vocab, dim): + vocab_sizes = vocab.get_vocab_size() + self.vocab_sizes = vocab_sizes + 
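# NOTE: max_vocab_size / projection_keys are kept for the optional padded, block-parallel projection path
+ # (the commented-out proj_weight / proj_bias buffers below, used by sample_from_logits_fast), while the
+ # per-feature Linear layers in hidden2logit remain the canonical parameters so the state_dict keeps matching. +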
self.max_vocab_size = max(vocab_sizes.values()) + self.projection_keys = list(vocab_sizes.keys()) # For index order + + # ✅ 保留原来的 Linear 层(这样 state_dict 可以匹配) + self.hidden2logit = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + + # # ✅ 构建用于 block 并行的权重 + # weight_list = [] + # bias_list = [] + + # for key in self.projection_keys: + # layer = self.hidden2logit[f"layer_{key}"] + # w = layer.weight + # b = layer.bias + + # # pad to max_vocab_size + # w_padded = F.pad(w, (0, 0, 0, self.max_vocab_size - w.shape[0])) + # b_padded = F.pad(b, (0, self.max_vocab_size - b.shape[0])) + + # weight_list.append(w_padded.unsqueeze(0)) # (1, Vmax, D) + # bias_list.append(b_padded.unsqueeze(0)) # (1, Vmax) + + # self.register_buffer("proj_weight", torch.cat(weight_list, dim=0)) # (F, Vmax, D) + # self.register_buffer("proj_bias", torch.cat(bias_list, dim=0)) # (F, Vmax) +class FeedForward(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + FeedForward sub-decoder is used for compound token like CP or NB. + We followed the original sub-decoder proposed in the paper "Compound Word Transformer", + however the embedding size for each sub-token or musical feature is the same in our implementation. + The reason for that is we didn't find any significant difference in the performance of the model + + There are two types of decoding style for the FeedForward sub-decoder: + 1. Partial-sequential prediction: Predict type token first and then predict all the sub-tokens in parallel (origianl CP) + 2. Fully-sequential prediction: Predict all the sub-tokens sequentially + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def _make_projection_layer(self, vocab, dim): + vocab_sizes = vocab.get_vocab_size() + self.hidden2logit = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim, size) for key, size in vocab_sizes.items() + }) + self.catvec2hidden = nn.ModuleDict({ + f"layer_{key}": nn.Linear(dim+dim, dim) for key, _ in vocab_sizes.items() + }) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + for feature in self.prediction_order: + if isinstance(feature, str): + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) # B x T x emb_size + catvec = torch.cat([hidden_vec, feature_emb.unsqueeze(0)], dim=-1) + hidden_vec = self.catvec2hidden[f"layer_{feature}"](catvec) + else: + assert feature == self.prediction_order[-1], "Parallel prediction should be the last feature" + for par_feature in feature: + logit = self.hidden2logit[f"layer_{par_feature}"](hidden_vec) + logits_dict[par_feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[par_feature] = sampled_token + return logits_dict, sampled_token_dict + + # ---- Training ---- # + for feature in self.prediction_order: + if isinstance(feature, str): + 
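# teacher forcing: condition the next sub-token on the ground-truth embedding of the current one
+ # (concatenate it to the hidden state and project back to d_model via catvec2hidden) +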
logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., self.vocab.feature_list.index(feature)]) # B x T x emb_size + catvec = torch.cat([hidden_vec, feature_emb], dim=-1) + hidden_vec = self.catvec2hidden[f"layer_{feature}"](catvec) + else: + assert feature == self.prediction_order[-1], "Parallel prediction should be the last feature" + for par_feature in feature: + logit = self.hidden2logit[f"layer_{par_feature}"](hidden_vec) + logits_dict[par_feature] = logit + return logits_dict + +class Parallel(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + Parallel sub-decoder is used for parallel prediction of multiple sub-tokens or musical features + This method is proposed in the paper "Multitrack Music Transformer" + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] + target = input_dict['target'] + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + for feature in self.prediction_order: + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + return logits_dict, sampled_token_dict + + # ---- Training ---- # + for feature in self.prediction_order: + logit = self.hidden2logit[f"layer_{feature}"](hidden_vec) + logits_dict[feature] = logit + return logits_dict + +class RNN(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + RNN sub-decoder is used for sequential prediction of multiple sub-tokens or musical features + This method is similar to the method proposed in "PianoTree VAE" + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.decoding_rnn = nn.GRU( + input_size=dim, + hidden_size=dim, + num_layers=sub_decoder_depth, + dropout=dropout, + batch_first=True) + + def _apply_pos_enc(self, tgt, apply_type='last'): + if apply_type == 'all': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + tgt_pos = tgt + self.pos_enc(pos.long()) + elif apply_type == 'last': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + pos_emb = self.pos_enc(pos.long()) + # zero out the pos_emb except for the last token + pos_emb[:, :-1, :] = 0 + tgt_pos = tgt + pos_emb + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, input_seq, target): + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = 
feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq = torch.cat([input_seq, feature_emb_reshape], dim=1) + return input_seq + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub_tokens-1 + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], -1)).unsqueeze(1) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape # (B*T) x 1 x d_model + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + h_0 = input_seq[:, 0, :].unsqueeze(0) # 1 x (B*T) x d_model + input_seq = self._apply_pos_enc(input_seq, apply_type='all') # (B*T) x 1 x d_model + for idx, feature in enumerate(self.prediction_order): + input_seq, _ = self.decoding_rnn(input_seq, h_0) # input_seq: (B*T) x (idx+1) x hidden_size, h_n: num_layers x (B*T) x hidden_size + logit = self.hidden2logit[f"layer_{feature}"](input_seq[:, -1, :]) # (B*T) x vocab_size + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + input_seq = torch.cat([input_seq, feature_emb_reshape], dim=1) # (B*T) x (idx+2) x d_model + input_seq = self._apply_pos_enc(input_seq, apply_type='last') # (B*T) x (idx+2) x d_model + return logits_dict, sampled_token_dict + + # ---- Training ---- # + input_seq = self._prepare_token_embedding_for_teacher_forcing(input_seq, target) # (B*T) x len(prediction_order) x d_model + # initial hidden state has no positional encoding + h0 = input_seq[:, 0, :].unsqueeze(0) # 1 x (B*T) x d_model + h0 = h0.contiguous() + # apply positional encoding + input_seq = self._apply_pos_enc(input_seq, apply_type='all') # (B*T) x len(prediction_order) x d_model + # get output using rnn + output, _ = self.decoding_rnn(input_seq, h0) # (B*T) x len(prediction_order) x d_model + output = output.reshape((hidden_vec.shape[0], hidden_vec.shape[1], len(self.prediction_order), -1)) # B x T x len(prediction_order) x d_model + for idx, feature in enumerate(self.prediction_order): + logit = self.hidden2logit[f"layer_{feature}"](output[:, :, idx, :]) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class SelfAttention(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + This sub-decoder is used for sequential prediction of multiple sub-tokens or musical features + This method is similar to the method proposed in "UniAudio", but different in making the sequence of sub-tokens. 
+ The UniAudio adds the output of the main decoder or hidden vec directly to embedding of the sub-token, + while our method puts the hidden vec in the input sequence so that the attention mechanism can learn the relationship between the hidden vec and the sub-token + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(1 + len(prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + + window_size = 1 # number of previous output of the main decoder to be used in the sub-decoder + causal_mask = generate_causality_mask_on_window(size=window_size + len(prediction_order), window_size=window_size) + self.register_buffer('causal_mask', causal_mask) + + self.transformer_decoder = Decoder( + dim = dim, + depth = sub_decoder_depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def _apply_pos_enc(self, tgt, apply_type='last'): + if apply_type == 'all': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + tgt_pos = tgt + self.pos_enc(pos.long()) + elif apply_type == 'last': + pos = torch.arange(tgt.shape[1]).to(tgt.device) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) + pos_emb = self.pos_enc(pos.long()) # (B*T) x (window_size + BOS + num_sub_tokens-1) x dim + # zero out the pos_emb except for the last token + pos_emb[:, :-1, :] = 0 + tgt_pos = tgt + pos_emb + return tgt_pos + + def _prepare_input_seq_list(self, hidden_vec_reshape, target=None): + input_seq_list = [] + input_seq_list.append(hidden_vec_reshape) + BOS_emb = self.sub_decoder_BOS_emb.unsqueeze(0).repeat(hidden_vec_reshape.shape[0], 1, 1) # (B*T) x 1 x d_model + if target is None: + input_seq_list.append(BOS_emb[-1:, :, :]) + else: # training + input_seq_list.append(BOS_emb) + return input_seq_list + + def _prepare_token_embedding_for_teacher_forcing(self, input_seq_list, target): + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + 
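# flatten batch and time: each main-decoder position becomes its own sub-token sequence of
+ # length window_size + BOS + (num_sub_tokens - 1) for the sub-decoder +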
feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + memory_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + return memory_tensor + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub_tokens + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq_list = self._prepare_input_seq_list(hidden_vec_reshape, target) + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + input_seq_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x (window_size + BOS) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS) x d_model + for idx, feature in enumerate(self.prediction_order): + output = self.transformer_decoder(pos_target_tensor) + logit = self.hidden2logit[f"layer_{feature}"](output[:, -1:]) + logits_dict[feature] = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + input_seq_tensor = torch.cat(input_seq_list, dim=1) + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='last') + return logits_dict, sampled_token_dict + + # ---- Training ---- # + # preparing for training + input_seq_tensor = self._prepare_token_embedding_for_teacher_forcing(input_seq_list, target) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + # get output using self-attention + output = self.transformer_decoder(pos_target_tensor) + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class SelfAttentionUniAudio(SelfAttention): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + ''' + Uniaudio version of self-attention sub-decoder + Through the experiments, we found that the performance of the model is better than our proposed self-attention sub-decoder + It shows comparable performance with the cross-attention sub-decoder + However, NMT shows better performance than UniAudio in terms of the performance of the model + ''' + + def _prepare_token_embedding_for_teacher_forcing(self, hidden_vec_reshape, target): + input_seq_list = [] + # append zero vector + input_seq_list.append(torch.zeros(hidden_vec_reshape.shape[0], 1, 
hidden_vec_reshape.shape[2]).to(self.device)) + for feature in self.prediction_order[:-1]: + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + input_seq_list.append(feature_emb_reshape) + + feature_tensor = torch.cat(input_seq_list, dim=1) # (B*T) x num_sub-tokens x d_model + # Ensure hidden_vec_reshape and feature_tensor have the same shape + assert hidden_vec_reshape.shape == feature_tensor.shape, f"Shapes of hidden_vec_reshape and feature_tensor do not match: {hidden_vec_reshape.shape} vs {feature_tensor.shape}" + # Sum hidden_vec_reshape and feature_tensor in the last dimension + memory_tensor = hidden_vec_reshape + feature_tensor + return memory_tensor + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] # B x T x num_sub-tokens + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + hidden_vec_reshape = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub-tokens x d_model + + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + pos_target_tensor = self._apply_pos_enc(hidden_vec_reshape, apply_type='all') # (B*T) x (window_size + BOS) x d_model + for idx, feature in enumerate(self.prediction_order): + output = self.transformer_decoder(pos_target_tensor) + logit = self.hidden2logit[f"layer_{feature}"](output[:, -1:]) + logits_dict[feature] = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + pos_target_tensor = torch.cat([pos_target_tensor[:, :idx+1, :], feature_emb_reshape + pos_target_tensor[:, idx+1:idx+2, :], pos_target_tensor[:, idx+2:, :]], dim=1) + + return logits_dict, sampled_token_dict + + # ---- Training ---- # + # preparing for training + input_seq_tensor = self._prepare_token_embedding_for_teacher_forcing(hidden_vec_reshape, target) # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + pos_target_tensor = self._apply_pos_enc(input_seq_tensor, apply_type='all') # (B*T) x (window_size + BOS + num_sub_tokens-1) x d_model + # get output using self-attention + output = self.transformer_decoder(pos_target_tensor) + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class CrossAttention(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + ''' + The power of Cross-attention and UniAudio style Self-attention lies in that using the output of the main decoder or hidden vec directly in the sub-decoder + As 
the output of the main decoder is the representation of the whole sequence, + it contains richer information which can even decode out sub-tokens in a parallel manner + So both architectures using the output of the main decoder in a direct way show better performance than the original self-attention sub-decoder + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.sub_decoder_enricher_use = sub_decoder_enricher_use + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(self.prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + if sub_decoder_enricher_use: + self.enricher_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + causal_mask = generate_SA_mask(len(prediction_order)) + causl_ca_mask = generate_CA_mask(len(prediction_order), len(prediction_order)).to(self.device) + self.register_buffer('causal_mask', causal_mask) + self.register_buffer('causal_ca_mask', causl_ca_mask) + + if sub_decoder_depth > 1: + self.sub_decoder_layers = nn.Sequential( + *[DecoderLayer(dim=dim, num_heads=heads, dropout=dropout) for _ in range(sub_decoder_depth)] + ) + else: + self.sub_decoder_layers = nn.Sequential(DecoderLayer(dim=dim, num_heads=heads, dropout=dropout)) + if sub_decoder_enricher_use: + self.feature_enricher_layers = nn.Sequential(FeatureEnricher(dim=dim, num_heads=heads, dropout=dropout)) + + def _apply_window_on_hidden_vec(self, hidden_vec): + BOS_emb = self.enricher_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + # through our experiments, we found that the size of the window doesn't affect the performance of the model much + window_size = 1 + zero_vec = torch.zeros((hidden_vec.shape[0], window_size-1, hidden_vec.shape[2])).to(self.device) # B x (window_size-1) x d_model + cat_hidden_vec = torch.cat([zero_vec, hidden_vec], dim=1) # B x (window_size-1+T) x d_model + new_hidden_vec = cat_hidden_vec.unfold(1, window_size, 1).transpose(2, 3) # B x T x window_size x d_model + new_hidden_vec = new_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], window_size, -1)) # (B*T) x window_size x d_model + new_hidden_vec = torch.cat([BOS_emb, new_hidden_vec], dim=1) # (B*T) x (window_size+1) x d_model + return new_hidden_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # num_sub_tokens + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x num_sub_tokens + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x num_sub_tokens x d_model + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, memory_list, target): + for _, feature in enumerate(self.prediction_order[:-1]): + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens-1) x d_model + return memory_tensor + + def _prepare_memory_list(self, hidden_vec, target=None): + memory_list = [] # used for key and value in cross attention + BOS_emb = self.sub_decoder_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # 
(B*T) x 1 x d_model + if target is not None: # training + memory_list.append(BOS_emb) + else: # inference + memory_list.append(BOS_emb[-1:, :, :]) + return memory_list + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x 1 x d_model + old_memory_tensor = memory_tensor + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec[-1:]} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] + CA_attn_mask = generate_CA_mask(input_seq_pos.shape[1], memory_tensor.shape[1]).to(self.device) + input_dict = {'input_seq': input_seq_pos[-1:], 'memory': memory_tensor, 'memory_mask': CA_attn_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((1, 1, -1)) # 1 x 1 x vocab_size + logits_dict[feature] = logit + sampled_token,prob = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + sampled_token_dict[feature] = sampled_token + if idx == len(self.prediction_order)-1: + return logits_dict, sampled_token_dict + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + idx+1) x d_model + return logits_dict, sampled_token_dict + + # ---- Training ---- # + memory_tensor = self._prepare_token_embedding_for_teacher_forcing(memory_list, target) # (B*T) x (BOS + num_sub_tokens-1) x d_model + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict + +class 
Flatten4Encodec(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool + ): + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None): + hidden_vec = input_dict['hidden_vec'] + + # ---- Training ---- # + logits_tensor = torch.zeros(hidden_vec.shape[0], hidden_vec.shape[1], 2049).to(self.device) + for idx, feature_type in enumerate(self.prediction_order): + # ::4 means that we only use the first token in each 4 tokens + # so the chosen tokens will be: 0, 4, 8, 12, ... + # 1::4 means that we only use the second token in each 4 tokens + # so the chosen tokens will be: 1, 5, 9, 13, ... + separated_hidden_vec = hidden_vec[:, idx::4, :] + logit = self.hidden2logit[f"layer_{feature_type}"](separated_hidden_vec) + logits_tensor[:, idx::4, :] = logit + # prob_dict[feature_type] = prob + return logits_tensor + + def run_one_step(self, input_dict, sampling_method=None, threshold=None, temperature=None, feature_type=None): + # ---- Generate(Inference) ---- # + hidden_vec = input_dict['hidden_vec'] + logit = self.hidden2logit[f"layer_{feature_type}"](hidden_vec[:, -1:]) + sampled_token = sample(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + return logit, sampled_token + + +class DiffusionDecoder(SubDecoderClass): + def __init__( + self, + prediction_order:list, + vocab:LangTokenVocab, + sub_decoder_depth:int, + dim:int, + heads:int, + dropout:float, + sub_decoder_enricher_use:bool, + MASK_IDX:int = 126336, + denoising_steps:int = 8, + eps:float = 1e-3, + method:str = 'low-confidence', # or random or auto-regressive + ): + ''' + The power of Cross-attention and UniAudio style Self-attention lies in that using the output of the main decoder or hidden vec directly in the sub-decoder + As the output of the main decoder is the representation of the whole sequence, + it contains richer information which can even decode out sub-tokens in a parallel manner + So both architectures using the output of the main decoder in a direct way show better performance than the original self-attention sub-decoder + ''' + super().__init__(prediction_order, vocab, sub_decoder_depth, dim, heads, dropout, sub_decoder_enricher_use) + self.sub_decoder_enricher_use = sub_decoder_enricher_use + self.feature_order_in_output = {key: (idx-len(prediction_order)) for idx, key in enumerate(prediction_order)} + + self.pos_enc = nn.Embedding(len(self.prediction_order), dim) + nn.init.zeros_(self.pos_enc.weight) + + self.sub_decoder_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + self.diffusion_mask_emb = nn.Parameter(torch.empty(dim), requires_grad=True) # embedding of mask token,idx is 126336,which is not in vocab + nn.init.normal_(self.diffusion_mask_emb, mean=0.0, std=0.02) + self.MASK_idx = MASK_IDX + self.denoising_steps = denoising_steps + self.eps = eps + self.method = method + + self.input_norm = nn.LayerNorm(dim) + + self.feature_boost_layers = nn.Sequential(TransformerLayer(dim=dim, num_heads=heads, dropout=dropout)) + + if sub_decoder_enricher_use: + self.enricher_BOS_emb = nn.Parameter(torch.zeros(dim), requires_grad=True) + causal_mask = generate_SA_mask(len(prediction_order)) + causal_ca_mask = generate_none_causality_mask(len(prediction_order), len(prediction_order)).to(self.device) + 
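# unlike CrossAttention above, the cross-attention mask here hides nothing (all zeros):
+ # the diffusion sub-decoder denoises all sub-tokens in parallel rather than left-to-right +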
self.register_buffer('causal_mask', causal_mask) + self.register_buffer('causal_ca_mask', causal_ca_mask) + + # get depth of the sub-decoder + if sub_decoder_depth > 1: + self.sub_decoder_layers = nn.Sequential(*[TransformerLayer(dim=dim, num_heads=heads, dropout=dropout) for _ in range(sub_decoder_depth)]) + else: + self.sub_decoder_layers = nn.Sequential(TransformerLayer(dim=dim, num_heads=heads, dropout=dropout)) + if sub_decoder_enricher_use: + self.feature_enricher_layers = nn.Sequential(FeatureEnricher(dim=dim, num_heads=heads, dropout=dropout)) + + + # simplified version of the forward process in diffusion model + def _forward_process(self, input_ids, eps=1e-3, mask_idx=None): + reshaped_input_ids = torch.reshape(input_ids, (-1, input_ids.shape[-1])) # B*T x num_sub_tokens + b, l = reshaped_input_ids.shape + t = torch.rand(b, device=input_ids.device) + p_mask = (1 - eps) * t + eps + p_mask = p_mask[:, None].repeat(1, l) + + masked_indices = torch.rand((b, l), device=input_ids.device) < p_mask + # 126336 is used for [MASK] token,attention that this token is not in the vocab + if mask_idx is not None: + noisy_batch = torch.where(masked_indices, mask_idx, reshaped_input_ids) + else: + noisy_batch = torch.where(masked_indices, 126336, reshaped_input_ids)# 126336 is used for [MASK] token in + return noisy_batch, masked_indices, p_mask + + + def _apply_window_on_hidden_vec(self, hidden_vec): + BOS_emb = self.enricher_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + # through our experiments, we found that the size of the window doesn't affect the performance of the model much + window_size = 1 + zero_vec = torch.zeros((hidden_vec.shape[0], window_size-1, hidden_vec.shape[2])).to(self.device) # B x (window_size-1) x d_model + cat_hidden_vec = torch.cat([zero_vec, hidden_vec], dim=1) # B x (window_size-1+T) x d_model + new_hidden_vec = cat_hidden_vec.unfold(1, window_size, 1).transpose(2, 3) # B x T x window_size x d_model + new_hidden_vec = new_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], window_size, -1)) # (B*T) x window_size x d_model + new_hidden_vec = torch.cat([BOS_emb, new_hidden_vec], dim=1) # (B*T) x (window_size+1) x d_model + return new_hidden_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # num_sub_tokens + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x num_sub_tokens + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x num_sub_tokens x d_model + return tgt_pos + + def _prepare_token_embedding_for_teacher_forcing(self, memory_list, target): + for _, feature in enumerate(self.prediction_order[:-1]): + feature_idx = self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens-1) x d_model + return memory_tensor + + # return a tensor + def _get_noisy_tensor(self, target_shape): + new_target = torch.zeros(target_shape).to(self.device) + # fill all the elements in the tensor with the embedding of the mask token + new_target[:, :, :] = self.diffusion_mask_emb + return new_target + + # prepare the embedding of the target, + def _prepare_embedding(self, memory_list, target): + for _, feature in enumerate(self.prediction_order): + feature_idx = 
self.vocab.feature_list.index(feature) + feature_emb = self.emb_layer.get_emb_by_key(feature, target[..., feature_idx]) # B x T x emb_size + feature_emb_reshape = feature_emb.reshape((feature_emb.shape[0]*feature_emb.shape[1], 1, -1)) # (B*T) x 1 x emb_size + memory_list.append(feature_emb_reshape) + memory_tensor = torch.cat(memory_list, dim=1) # (B*T) x (BOS + num_sub_tokens) x d_model + return memory_tensor + + + def _prepare_memory_list(self, hidden_vec, target=None, add_BOS=True): + memory_list = [] # used for key and value in cross attention + BOS_emb = self.sub_decoder_BOS_emb.reshape(1,1,-1).repeat(hidden_vec.shape[0]*hidden_vec.shape[1], 1, 1) # (B*T) x 1 x d_model + if add_BOS is true: + if target is not None: # training + memory_list.append(BOS_emb) + else: # inference + memory_list.append(BOS_emb[-1:, :, :]) + else: + pass + return memory_list + + def _get_num_transfer_tokens(self, mask_index, steps): + ''' + In the reverse process, the interval [0, 1] is uniformly discretized into steps intervals. + Furthermore, because LLaDA employs a linear noise schedule (as defined in Eq. (8)), + the expected number of tokens transitioned at each step should be consistent. + + This function is designed to precompute the number of tokens that need to be transitioned at each step. + ''' + mask_num = mask_index.sum(dim=1, keepdim=True) + base = mask_num // steps + remainder = mask_num % steps + + num_transfer_tokens = torch.zeros(mask_num.size(0), steps, device=mask_index.device, dtype=torch.int64) + base + + for i in range(mask_num.size(0)): + num_transfer_tokens[i, :remainder[i]] += 1 + + return num_transfer_tokens + + def sample_from_logits(self, attn_output, hidden_vec, sampling_method=None, threshold=None, temperature=None, force_decode=False,step=None): + sampled_token_dict = {} + logits_dict = {} + candidate_token_embeddings = {} + candidate_token_probs = {} + b,t,d = hidden_vec.shape # B x T x d_model + # print("*"*8) + logits_list = [] + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_list.append(logit) + for idx, feature in enumerate(self.prediction_order): + logit = logits_list[idx] # B x T x vocab_siz + sampled_token, prob = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + if step==0 and force_decode: + if feature == 'velocity': + sampled_token = torch.tensor([2]).to(logit.device) + prob = torch.tensor([1.0]).to(logit.device) + else: + prob = torch.tensor([0.0]).to(logit.device) + # print(feature, sampled_token, prob) + sampled_token_dict[feature] = sampled_token + logits_dict[feature] = logit + candidate_token_probs[feature] = prob + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + candidate_token_embeddings[feature] = feature_emb_reshape + stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, -1)) # (B*T) x num_sub_tokens x vocab_size + stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, -1, d)) # (B*T) x num_sub_tokens x d_model + # print("sampled_token_dict", sampled_token_dict) + return sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings + + def 
sample_from_logits_fast(self, attn_output, hidden_vec, sampling_method=None, threshold=None, temperature=None):
+ sampled_token_dict = {}
+ logits_dict = {}
+ candidate_token_embeddings = {}
+ candidate_token_probs = {}
+
+ b, t, d = hidden_vec.shape # (B, T, D)
+ num_features = len(self.projection_keys) # number of sub-token feature types
+ Vmax = self.max_vocab_size
+
+ # === 1. Collect the output position of every feature === #
+ feature_pos_list = [self.feature_order_in_output[f] for f in self.projection_keys]
+
+ # === 2. Gather those positions from attn_output → (B, F, D) === #
+ attn_features = torch.stack(
+ [attn_output[:, pos, :] for pos in feature_pos_list], dim=1
+ ) # (B, F, D)
+
+ # === 3. Batched matmul: run all feature projections in parallel with einsum === #
+ # NOTE: this path assumes the padded proj_weight / proj_bias buffers (the commented-out block
+ # in _make_projection_layer) have been registered.
+ # attn_features: (B, F, D)
+ # proj_weight: (F, Vmax, D)
+ # proj_bias: (F, Vmax)
+ # output: (B, F, Vmax)
+ logits = torch.einsum("bfd,fvd->bfv", attn_features, self.proj_weight) + self.proj_bias
+
+ # === 4. Truncate each feature's logits back to its original vocab size === #
+ logits_list = []
+ logits_dict_by_feature = {
+ feature: logits[:, i, :self.vocab_sizes[feature]]
+ for i, feature in enumerate(self.projection_keys)
+}
+ for i, feature in enumerate(self.projection_keys):
+ vocab_size = self.vocab_sizes[feature]
+ logits_list.append(logits[:, i, :vocab_size]) # (B, vocab_size)
+ for idx, feature in enumerate(self.prediction_order):
+ logit = logits_dict_by_feature[feature].unsqueeze(0) # B x T x vocab_size
+ sampled_token, prob = sample_with_prob_fast(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature)
+ # print(feature, sampled_token, prob)
+ sampled_token_dict[feature] = sampled_token.squeeze(0) # B x T
+ logits_dict[feature] = logit
+ candidate_token_probs[feature] = prob
+ feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token)
+ feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size
+ candidate_token_embeddings[feature] = feature_emb_reshape
+ stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, -1)) # (B*T) x num_sub_tokens x vocab_size
+ stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, -1, d)) # (B*T) x num_sub_tokens x d_model
+
+ return sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings
+
+ def choose_tokens(self, hidden_vec, step, method, stacked_logits_probs, num_transfer_tokens):
+ if method == 'low-confidence':
+ _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1)
+ elif method == 'random':
+ indices = torch.randint(0, stacked_logits_probs.shape[-1], (int(num_transfer_tokens[:, step]),), device=stacked_logits_probs.device)
+ elif method == 'auto-regressive':
+ indices = torch.tensor([[step]], device=hidden_vec.device)
+ return indices
+
+
+ def forward_(self, input_dict, sampling_method=None, threshold=None, temperature=None, worst_case=False, validation=False):
+ logits_dict = {}
+ hidden_vec = input_dict['hidden_vec'] # B x T x d_model
+ target = input_dict['target'] # B x T x num_sub_tokens
+
+
+ # apply window on hidden_vec for enricher
+ if self.sub_decoder_enricher_use:
+ window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model
+ hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model
+ input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model
+ input_seq_pos = input_seq
+ # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model
+ #
prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + for step in range(self.denoising_steps): + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + # input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings = self.sample_from_logits(attn_output, hidden_vec, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + # indices = self.choose_tokens(hidden_vec,step, "auto-regressive", stacked_logits_probs, num_transfer_tokens) + indices = self.choose_tokens(hidden_vec, step, self.method, stacked_logits_probs, num_transfer_tokens) + # breakpoint() + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + stored_probs_dict[self.prediction_order[j]] = candidate_token_probs[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + # breakpoint() + # print("stored_probs_dict", stored_probs_dict) + # print("sampled_token_dict", sampled_token_dict) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) + if self.sub_decoder_enricher_use: + input_dict = 
{'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) + + def forward_old(self, input_dict, sampling_method=None, threshold=None, temperature=None, worst_case=False, validation=False): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] #B x T x d_model + bos_hidden_vec = input_dict['bos_token_hidden'] # B x 1 x d_model, used for the first token in the sub-decoder + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + + if bos_hidden_vec is None: # start of generation + if target is None: + bos_hidden_vec = input_seq_pos + else: + bos_hidden_vec =hidden_vec[:, 0, :].unsqueeze(1).repeat(1, hidden_vec.shape[1], 1) # B x T x d_model + bos_hidden_vec = bos_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) + + else: + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + + # input_seq_pos = input_seq + input_dict = {'input_seq': input_seq_pos, 'memory': bos_hidden_vec, 'memory_mask': self.causal_ca_mask} + boosted_input_dict = self.feature_boost_layers(input_dict) # (B*T) x num_sub_tokens x d_model + input_seq_pos = boosted_input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self.input_norm(input_seq_pos) # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + for step in range(self.denoising_steps): + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x 
num_sub_tokens x d_model + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + candidate_token_embeddings = {} + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + sampled_token,probs = sample_with_prob(logit, sampling_method=sampling_method, threshold=threshold, temperature=temperature) + # print(idx,feature,sampled_token,probs) + sampled_token_dict[feature] = sampled_token + candidate_token_probs[feature] = probs + feature_emb = self.emb_layer.get_emb_by_key(feature, sampled_token) + feature_emb_reshape = feature_emb.reshape((1, 1, -1)) # (B*T) x 1 x emb_size + candidate_token_embeddings[feature] = feature_emb_reshape + + stacked_logits_probs = torch.stack(list(candidate_token_probs.values()), dim=0).reshape((b*t, l)) # (B*T) x num_sub_tokens x vocab_size + stacked_token_embeddings = torch.stack(list(candidate_token_embeddings.values()), dim=0).reshape((b*t, l, d)) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + + if self.method == 'low-confidence': + _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1) + elif self.method == 'random': + indices = torch.randint(0, stacked_logits_probs.shape[-1], (num_transfer_tokens[:, step],)).to(logit.device) + elif self.method == 'auto-regressive': + indices = torch.tensor([[step]], device=logit.device) + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + stored_probs_dict[self.prediction_order[j]] = candidate_token_probs[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) 
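For orientation, here is a minimal standalone sketch of the two pieces the denoising loop above relies on: an even per-step schedule for how many masked sub-tokens to commit, and a low-confidence rule that keeps uncertain positions masked while committing the most confident ones. This is illustrative only (hypothetical helper names), not the repository's actual _get_num_transfer_tokens / choose_tokens implementation and not part of the diff.

import torch

def get_num_transfer_tokens(masked: torch.Tensor, steps: int) -> torch.Tensor:
    # masked: (N, L) bool, True where a sub-token is still masked.
    # Returns an (N, steps) schedule whose rows sum to masked.sum(-1),
    # i.e. how many sub-tokens to commit at each denoising step.
    total = masked.sum(dim=-1, keepdim=True)           # (N, 1)
    base = total // steps
    remainder = total % steps
    schedule = base.repeat(1, steps)
    for i in range(masked.shape[0]):
        schedule[i, : int(remainder[i])] += 1          # early steps absorb the remainder
    return schedule

def commit_most_confident(confidence: torch.Tensor, masked: torch.Tensor, k: int):
    # Keep low-confidence positions masked and commit the k most confident ones.
    confidence = confidence.masked_fill(~masked, float("-inf"))
    _, indices = torch.topk(confidence, k=k, dim=-1)   # (N, k)
    newly_committed = torch.zeros_like(masked).scatter(-1, indices, True)
    return masked & ~newly_committed, indices

masked = torch.ones(2, 8, dtype=torch.bool)            # 2 flattened (B*T) positions, 8 sub-tokens
schedule = get_num_transfer_tokens(masked, steps=4)    # [[2, 2, 2, 2], [2, 2, 2, 2]]
confidence = torch.rand(2, 8)
masked, chosen = commit_most_confident(confidence, masked, k=int(schedule[0, 0]))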
+ memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # all is embedding + # memory_tensor = self.layer_norm(memory_tensor) + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + # inter_input = torch.cat([input_seq_pos, memory_tensor], dim=1) + # inter_input = input_seq_pos + memory_tensor # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) + + def forward(self, input_dict, sampling_method=None, threshold=None, temperature=None, Force_decode=False, worst_case=False, validation=False): + logits_dict = {} + hidden_vec = input_dict['hidden_vec'] # B x T x d_model + target = input_dict['target'] #B x T x d_model + bos_hidden_vec = input_dict['bos_token_hidden'] # B x 1 x d_model, used for the first token in the sub-decoder + + # apply window on hidden_vec for enricher + if self.sub_decoder_enricher_use: + window_applied_hidden_vec = self._apply_window_on_hidden_vec(hidden_vec) # (B*T) x window_size x d_model + hidden_vec_reshape = hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) # (B*T) x 1 x d_model + input_seq = hidden_vec_reshape.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + + if bos_hidden_vec is None: # start of generation + if target is None: + bos_hidden_vec = input_seq_pos + else: + bos_hidden_vec =hidden_vec[:, 0, :].unsqueeze(1).repeat(1, hidden_vec.shape[1], 1) # B x T x d_model + bos_hidden_vec = bos_hidden_vec.reshape((hidden_vec.shape[0]*hidden_vec.shape[1], 1, -1)) + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) + + else: + bos_hidden_vec = bos_hidden_vec.repeat(1, len(self.prediction_order), 1) # (B*T) x num_sub_tokens x d_model + + # input_seq_pos = input_seq + input_dict = {'input_seq': input_seq_pos, 'memory': bos_hidden_vec, 'memory_mask': self.causal_ca_mask} + boosted_input_dict = self.feature_boost_layers(input_dict) # (B*T) x num_sub_tokens x d_model + input_seq_pos = boosted_input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self.input_norm(input_seq_pos) # (B*T) x num_sub_tokens x d_model + # input_seq_pos = self._apply_pos_enc(input_seq) # (B*T) x num_sub_tokens x d_model + # prepare memory + memory_list = self._prepare_memory_list(hidden_vec=hidden_vec, target=target, add_BOS=False) + # ---- Generate(Inference) ---- # + if target is None: + sampled_token_dict = {} + b,t,d = 
hidden_vec.shape # B x T x d_model + l = len(self.prediction_order) # num_sub_tokens + memory_tensor = self._get_noisy_tensor(target_shape=(b*t, l, d)) + all_noise_tensor = memory_tensor.clone() # (B*T) x num_sub_tokens x d_model + + # indicate the position of the mask token,1 means that the token hsa been masked + masked_history = torch.ones((b*t, l), device=hidden_vec.device, dtype=torch.int64).bool() + num_transfer_tokens = self._get_num_transfer_tokens(masked_history, self.denoising_steps) + # denoising c + stored_logits_dict = {} + stored_probs_dict = {} + # with torch.profiler.profile( + # activities=[ + # torch.profiler.ProfilerActivity.CPU, + # torch.profiler.ProfilerActivity.CUDA], + # record_shapes=True, + # profile_memory=True, + # with_stack=True + # ) as prof: + for step in range(self.denoising_steps): + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # nomalize the memory tensor + # memory_tensor = self.layer_norm(memory_tensor) # (B*T) x num_sub_tokens x d_model + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # input_dict = {'input_seq': input_seq_pos, 'memory': memory_tensor, 'memory_mask': self.causal_ca_mask} + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + candidate_token_probs = {} + + sampled_token_dict, logits_dict, candidate_token_probs, stacked_logits_probs, stacked_token_embeddings = self.sample_from_logits(attn_output, hidden_vec, sampling_method=sampling_method, threshold=threshold, temperature=temperature, + force_decode=Force_decode, + step=step) + + # set prob of the changed tokens to -inf + stacked_logits_probs = torch.where(masked_history, stacked_logits_probs, -torch.inf) + + if self.method == 'low-confidence': + _, indices = torch.topk(stacked_logits_probs, k=int(num_transfer_tokens[:,step]), dim=-1) + elif self.method == 'random': + indices = torch.randint(0, stacked_logits_probs.shape[-1], (num_transfer_tokens[:, step],)).to(logit.device) + elif self.method == 'auto-regressive': + indices = torch.tensor([[step]], device=logit.device) + # undate the masked history + for i in range(b*t): + for j in range(l): + if j in indices[i]: + masked_history[i][j] = False + stored_logits_dict[self.prediction_order[j]] = logits_dict[self.prediction_order[j]].clone() + expand_masked_history = masked_history.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x num_sub_tokens x d_model + memory_tensor = torch.where(expand_masked_history, all_noise_tensor, stacked_token_embeddings) + # print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10)) + # print(sampled_token_dict) + return stored_logits_dict, sampled_token_dict + + # ---- Training ---- # + _, masked_indices, p_mask = self._forward_process(target, mask_idx=self.MASK_idx) # (B*T) x (num_sub_tokens) x d_model + memory_tensor = self._prepare_embedding(memory_list, target) # (B*T) x (num_sub_tokens) x d_model + # apply layer norm + + extend_masked_indices = masked_indices.unsqueeze(-1).expand(-1, -1, memory_tensor.shape[-1]) # (B*T) x (num_sub_tokens) x d_model + if worst_case: # mask all ,turn into parallel + extend_masked_indices = 
torch.ones_like(extend_masked_indices).to(self.device) + memory_tensor = torch.where(extend_masked_indices, self.diffusion_mask_emb, memory_tensor) + memory_tensor = self._apply_pos_enc(memory_tensor) # (B*T) x num_sub_tokens x d_model + # all is embedding + # memory_tensor = self.layer_norm(memory_tensor) + # apply feature enricher to memory + if self.sub_decoder_enricher_use: + input_dict = {'input_seq': memory_tensor, 'memory': window_applied_hidden_vec} + input_dict = self.feature_enricher_layers(input_dict) + memory_tensor = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # implement sub decoder cross attention + input_dict = {'input_seq': memory_tensor, 'memory': input_seq_pos, 'memory_mask': self.causal_ca_mask} + input_dict = self.sub_decoder_layers(input_dict) + attn_output = input_dict['input_seq'] # (B*T) x num_sub_tokens x d_model + # get prob + for idx, feature in enumerate(self.prediction_order): + feature_pos = self.feature_order_in_output[feature] + logit = self.hidden2logit[f"layer_{feature}"](attn_output[:, feature_pos, :]) + logit = logit.reshape((hidden_vec.shape[0], hidden_vec.shape[1], -1)) # B x T x vocab_size + logits_dict[feature] = logit + return logits_dict, (masked_indices, p_mask) \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/__init__.py b/Amadeus/symbolic_encoding/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..b115a9d Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..8963d99 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..abbcb9c Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/__init__.cpython-312.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc new file mode 100644 index 0000000..0a0c766 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc new file mode 100644 index 0000000..00368c3 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/augmentor.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc new file mode 100644 index 0000000..e966c57 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc new file mode 100644 index 0000000..ed17777 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/compile_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc 
b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc new file mode 100644 index 0000000..e3258ec Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc new file mode 100644 index 0000000..385928a Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/data_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc new file mode 100644 index 0000000..5258d8d Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc new file mode 100644 index 0000000..0de9881 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/decoding_utils.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc new file mode 100644 index 0000000..02944f3 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-310.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc new file mode 100644 index 0000000..d196e76 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-311.pyc differ diff --git a/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc new file mode 100644 index 0000000..4b4a5d5 Binary files /dev/null and b/Amadeus/symbolic_encoding/__pycache__/midi2audio.cpython-312.pyc differ diff --git a/Amadeus/symbolic_encoding/anylazesf.py b/Amadeus/symbolic_encoding/anylazesf.py new file mode 100644 index 0000000..803d5e6 --- /dev/null +++ b/Amadeus/symbolic_encoding/anylazesf.py @@ -0,0 +1,46 @@ +from sf2utils.sf2parse import Sf2File + +def print_sorted_presets(sf2_path): + presets_info = [] + + with open(sf2_path, 'rb') as f: + sf2 = Sf2File(f) + + for preset in sf2.presets: + try: + # Try reading the attributes directly + name = getattr(preset, 'name', '???').strip('\x00') + bank = getattr(preset, 'bank', None) + program = getattr(preset, 'preset', None) + + # If they are not available, try to get them from sub-attributes + if bank is None or program is None: + for attr in dir(preset): + attr_value = getattr(preset, attr) + if hasattr(attr_value, 'bank') and hasattr(attr_value, 'preset'): + bank = attr_value.bank + program = attr_value.preset + name = getattr(attr_value, 'name', name).strip('\x00') + break + + # Collect valid results + if bank is not None and program is not None: + presets_info.append((program, bank, name)) + except Exception as e: + print(f"Error reading preset: {e}") + + # Sort by program in ascending order (to sort by bank first and then program, use sorted(..., key=lambda x: (x[1], x[0]))) + presets_info.sort(key=lambda x: x[0]) + + # Print the results + print(f"{'Program':<8} {'Bank':<6} {'Preset Name'}") + print("-" * 40) + for program, bank, name in presets_info: + print(f"{program:<8} {bank:<6} {name}") + +# DEFAULT_SOUND_FONT = '/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2' +# DEFAULT_SOUND_FONT = '~/.fluidsynth/default_sound_font.sf2' + +# Replace with the path to your sf2 file +sf2_path =
"/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2" +print_sorted_presets(sf2_path) \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/augmentor.py b/Amadeus/symbolic_encoding/augmentor.py new file mode 100644 index 0000000..40c1839 --- /dev/null +++ b/Amadeus/symbolic_encoding/augmentor.py @@ -0,0 +1,94 @@ +import random +from typing import Union + +import torch + +class Augmentor: + def __init__( + self, + vocab, + aug_type:Union[str, None], + input_length:int + ): + self.vocab = vocab + self.aug_type = aug_type + self.input_length = input_length + self.feature_list = vocab.feature_list + self.num_features = len(self.feature_list) + self.encoding_scheme = vocab.encoding_scheme + + self.pitch_idx = self.feature_list.index('pitch') + if 'chord' in self.feature_list: + self.chord_idx = self.feature_list.index('chord') + + def _get_shift(self, segment): + # the pitch vocab has ignore token in 0 index + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + pitch_mask = segment != 0 + pitch_segment = segment[pitch_mask[:,self.pitch_idx], self.pitch_idx] + # check if tensor is empty + if pitch_segment.numel() == 0: + shift = 0 + else: + lowest_pitch = max(12, torch.min(pitch_segment)) + highest_pitch = min(119, torch.max(pitch_segment)) + lower_shift_bound = torch.where(lowest_pitch - torch.arange(6) > 11)[0][-1].item() + upper_shift_bound = torch.where(highest_pitch + torch.arange(7) < 120)[0][-1].item() + shift = random.randint(-lower_shift_bound, upper_shift_bound) + else: # remi + mask_for_pitch = self.vocab.total_mask['pitch'].to(segment.device) + segemnt_pitch_mask = mask_for_pitch[segment] + segment_pitch = segment * segemnt_pitch_mask + segment_pitch = segment_pitch[segment_pitch != 0] + # check if tensor is empty + if segment_pitch.numel() == 0: + shift = 0 + else: + lower_bound = torch.argwhere(mask_for_pitch == 1)[0].item() + upper_bound = torch.argwhere(mask_for_pitch == 1)[-1].item() + lowest_pitch = max(lower_bound, torch.min(segment_pitch)) + highest_pitch = min(upper_bound, torch.max(segment_pitch)) + lower_shift_bound = torch.where(lowest_pitch - torch.arange(6) >= lower_bound)[0][-1].item() + upper_shift_bound = torch.where(highest_pitch + torch.arange(7) <= upper_bound)[0][-1].item() + shift = random.randint(-lower_shift_bound, upper_shift_bound) + return shift + + # TODO: arrange hard coded part + def __call__(self, segment): + ''' + input_tensor is segments of x, y + for transformer_xl, the shape of x, y is [max_num_segments, input_length, num_features] + so we need to change the shape of x, y to [max_num_segments*input_length, num_features] + ''' + if self.aug_type == 'random': + shift = self._get_shift(segment) + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + # pitch augmentation + segment_pitch_mask = segment != 0 + new_segment = segment.clone() + new_segment[segment_pitch_mask[:,self.pitch_idx], self.pitch_idx] += shift + if 'chord' in self.feature_list: + # chord augmentation + segment_chord_mask = (segment[:,self.chord_idx] != 0) & (segment[:,self.chord_idx] != 1) + new_segment[segment_chord_mask, self.chord_idx] = (((new_segment[segment_chord_mask, self.chord_idx]-2) % 12) + shift ) % 12 + ((new_segment[segment_chord_mask, self.chord_idx]-2) // 12) * 12 + 2 + segment = new_segment + else: # remi + # choose random interger between -5 and 6 + # the augmented results from shift -6 and 6 are same, so we choose -5 and 6 + # pitch augmentation + mask_for_pitch = 
self.vocab.total_mask['pitch'].to(segment.device) + segment_pitch_mask = mask_for_pitch[segment] + new_segment = segment.clone() + new_segment_valid = (new_segment + shift) * segment_pitch_mask + new_segment = new_segment * (1 - segment_pitch_mask) + new_segment_valid + if 'chord' in self.feature_list: + # chord augmentation + mask_for_chord = self.vocab.total_mask['chord'].clone().to(segment.device) + chord_n_n_idx = torch.argwhere(mask_for_chord == 1)[-1].item() + mask_for_chord[chord_n_n_idx] = 0 + start_idx_chord = self.vocab.remi_vocab_boundaries_by_key['chord'][0] + segment_chord_mask = mask_for_chord[segment] + new_segment_valid = ((((new_segment - start_idx_chord) % 12 + shift) % 12) + ((new_segment - start_idx_chord) // 12) * 12 + start_idx_chord) * segment_chord_mask + new_segment = new_segment * (1 - segment_chord_mask) + new_segment_valid + segment = new_segment + return segment diff --git a/Amadeus/symbolic_encoding/compile_utils.py b/Amadeus/symbolic_encoding/compile_utils.py new file mode 100644 index 0000000..40eae76 --- /dev/null +++ b/Amadeus/symbolic_encoding/compile_utils.py @@ -0,0 +1,207 @@ +import random +from collections import defaultdict + +import torch +import numpy as np +import random + +def reverse_shift_and_pad(tune_in_idx, slice_boundary=4): + new_lst = [curr_elems[:slice_boundary] + next_elems[slice_boundary:] for curr_elems, next_elems in zip(tune_in_idx, tune_in_idx[1:])] + return new_lst + +def reverse_shift_and_pad_for_tensor(tensor, first_pred_feature): + ''' + tensor: [batch_size x seq_len x feature_size] + ''' + if first_pred_feature == 'type': + return tensor + if tensor.shape[-1] == 8: + slice_boundary_dict = {'type':0, 'beat':1, 'chord':2, 'tempo':3, 'instrument':4, 'pitch':5, 'duration':6, 'velocity':7} + elif tensor.shape[-1] == 7: + slice_boundary_dict = {'type':0, 'beat':1, 'chord':2, 'tempo':3, 'pitch':4, 'duration':5, 'velocity':6} + elif tensor.shape[-1] == 5: + slice_boundary_dict = {'type':0, 'beat':1, 'instrument':2, 'pitch':3, 'duration':4} + elif tensor.shape[-1] == 4: + slice_boundary_dict = {'type':0, 'beat':1, 'pitch':2, 'duration':3} + slice_boundary = slice_boundary_dict[first_pred_feature] + new_tensor = torch.zeros_like(tensor) + new_tensor[..., :, :slice_boundary] = tensor[..., :, :slice_boundary] + new_tensor[..., :-1, slice_boundary:] = tensor[..., 1:, slice_boundary:] + return new_tensor + +def shift_and_pad(tune_in_idx, first_pred_feature): + if first_pred_feature == 'type': + return tune_in_idx + if len(tune_in_idx[0]) == 8: + slice_boundary_dict = {'type':0, 'beat':-7, 'chord':-6, 'tempo':-5, 'instrument':-4, 'pitch':-3, 'duration':-2, 'velocity':-1} + elif len(tune_in_idx[0]) == 7: + slice_boundary_dict = {'type':0, 'beat':-6, 'chord':-5, 'tempo':-4, 'pitch':-3, 'duration':-2, 'velocity':-1} + elif len(tune_in_idx[0]) == 5: + slice_boundary_dict = {'type':0, 'beat':-4, 'instrument':-3, 'pitch':-2, 'duration':-1} + elif len(tune_in_idx[0]) == 4: + slice_boundary_dict = {'type':0, 'beat':-3, 'pitch':-2, 'duration':-1} + slice_boundary = slice_boundary_dict[first_pred_feature] + # Add an empty list padded with zeros at the beginning, and sos and eos tokens are not shifted + padded_tune_in_idx = torch.cat([torch.zeros(1, len(tune_in_idx[0]), dtype=torch.long), tune_in_idx], dim=0) + new_tensor = torch.zeros_like(padded_tune_in_idx) + new_tensor[:, slice_boundary:] = padded_tune_in_idx[:, slice_boundary:] + new_tensor[:-1, :slice_boundary] = padded_tune_in_idx[1:, :slice_boundary] + return new_tensor + +class 
VanillaTransformer_compiler(): + def __init__( + self, + data_list, + augmentor, + eos_token, + input_length, + first_pred_feature, + encoding_scheme + ): + self.data_list = data_list + self.augmentor = augmentor + self.eos_token = eos_token + self.input_length = input_length + self.first_pred_feature = first_pred_feature + self.encoding_scheme = encoding_scheme + + def make_segments(self, data_type): + segments = [] + tune_name2segment = defaultdict(list) + segment2tune_name = [] + num_segments = 0 + for i in range(len(self.data_list)): + tune_in_idx, tune_name = self.data_list[i] + tune_in_idx = torch.LongTensor(tune_in_idx) + if self.encoding_scheme == 'remi' or self.encoding_scheme == 'cp': + eos_token = torch.LongTensor(self.eos_token) + else: + eos_token = torch.LongTensor(self.eos_token) + # shift and pad + tune_in_idx = shift_and_pad(tune_in_idx, self.first_pred_feature) + if data_type == 'train': + if len(tune_in_idx) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx), 1) + mask = torch.cat([torch.ones(len(tune_in_idx), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx, padding_seq], dim=0) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + else: + start_point = 0 + while start_point + self.input_length+1 < len(tune_in_idx): + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment = tune_in_idx[start_point:start_point + self.input_length+1] + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + assert len(segment) == self.input_length+1 + # Randomly choose the start point for the next segment, which is in the range of half of the current segment to the end of the current segment + start_point += random.randint((self.input_length+1)//2, self.input_length+1) + # if text controled,we only use the first segment + # add the last segment + if len(tune_in_idx[start_point:]) < self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx[start_point:])) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx[start_point:]), 1) + mask = torch.cat([torch.ones(len(tune_in_idx[start_point:]), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx[start_point:], padding_seq], dim=0) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + + + else: # for validset + for i in range(0, len(tune_in_idx), self.input_length+1): + segment = tune_in_idx[i:i+self.input_length+1] + if len(segment) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(segment)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(segment), 1) + mask = torch.cat([torch.ones(len(segment), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([segment, padding_seq], dim=0) + segment2tune_name.append(tune_name) + segments.append([segment, mask]) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + else: + mask = torch.ones(self.input_length+1, dtype=torch.long) + segments.append([segment, mask]) + segment2tune_name.append(tune_name) + segments.append([segment, mask]) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + assert 
len(segment) == self.input_length+1 + + return segments, tune_name2segment, segment2tune_name + + def make_segments_iters(self, data_type): + tune_name2segment = defaultdict(list) + segment2tune_name = [] + num_segments = 0 + # shuffle the data_list + if data_type == 'train': + random.shuffle(self.data_list) + print("length of data_list:", len(self.data_list)) + for i in range(len(self.data_list)): + tune_in_idx, tune_name = self.data_list[i] + tune_in_idx = torch.LongTensor(tune_in_idx) + if self.encoding_scheme == 'remi' or self.encoding_scheme == 'cp': + eos_token = torch.LongTensor(self.eos_token) + else: + eos_token = torch.LongTensor(self.eos_token) + # shift and pad + tune_in_idx = shift_and_pad(tune_in_idx, self.first_pred_feature) + if data_type == 'train': + if len(tune_in_idx) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx), 1) + mask = torch.cat([torch.ones(len(tune_in_idx), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx, padding_seq], dim=0) + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + else: + start_point = 0 + while start_point + self.input_length+1 < len(tune_in_idx): + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment = tune_in_idx[start_point:start_point + self.input_length+1] + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + assert len(segment) == self.input_length+1 + start_point += random.randint((self.input_length+1)//2, self.input_length+1) + # break + if len(tune_in_idx[start_point:]) < self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(tune_in_idx[start_point:])) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(tune_in_idx[start_point:]), 1) + mask = torch.cat([torch.ones(len(tune_in_idx[start_point:]), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([tune_in_idx[start_point:], padding_seq], dim=0) + segment2tune_name.append(tune_name) + yield [segment, mask], tune_name2segment, segment2tune_name + else: # for validset + for i in range(0, len(tune_in_idx), self.input_length+1): + segment = tune_in_idx[i:i+self.input_length+1] + if len(segment) <= self.input_length+1: + if 'remi' in self.encoding_scheme: + padding_seq = eos_token[0].repeat(self.input_length+1-len(segment)) + else: + padding_seq = eos_token.repeat(self.input_length+1-len(segment), 1) + mask = torch.cat([torch.ones(len(segment), dtype=torch.long), torch.zeros(len(padding_seq), dtype=torch.long)], dim=0) + segment = torch.cat([segment, padding_seq], dim=0) + segment2tune_name.append(tune_name) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + yield [segment, mask], tune_name2segment, segment2tune_name + else: + mask = torch.ones(self.input_length+1, dtype=torch.long) + segment2tune_name.append(tune_name) + num_segments += 1 + tune_name2segment[tune_name].append(num_segments-1) + yield [segment, mask], tune_name2segment, segment2tune_name + assert len(segment) == self.input_length+1 + diff --git a/Amadeus/symbolic_encoding/data_utils.py b/Amadeus/symbolic_encoding/data_utils.py new file mode 100644 index 0000000..2400d36 --- /dev/null +++ b/Amadeus/symbolic_encoding/data_utils.py @@ -0,0 +1,1610 @@ 
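Before the data_utils.py listing, a small worked example of the shift_and_pad helper from compile_utils.py above may help. The event values are made up purely for illustration; the 4-feature order (type, beat, pitch, duration) is taken from the slice_boundary_dict in that function, and this sketch is not part of the diff.

import torch
from Amadeus.symbolic_encoding.compile_utils import shift_and_pad

# Three NB events with 4 sub-token features in the order (type, beat, pitch, duration).
events = torch.tensor([[1, 10, 60, 4],
                       [1, 11, 62, 4],
                       [2, 12, 64, 8]], dtype=torch.long)

shifted = shift_and_pad(events, first_pred_feature='beat')
# slice_boundary = -3: (beat, pitch, duration) stay in place with a zero row prepended,
# while the 'type' column is pulled from the following event, so 'beat' becomes the
# first feature the sub-decoder predicts at each timestep:
# tensor([[ 1,  0,  0,  0],
#         [ 1, 10, 60,  4],
#         [ 2, 11, 62,  4],
#         [ 0, 12, 64,  8]])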
+import re +import random +from pathlib import Path +from collections import OrderedDict +from typing import Union, List, Tuple, Dict + +import numpy as np +import matplotlib.pyplot as plt +# lock of thread +from threading import Lock + +import json +from tqdm import tqdm +from torch.utils.data import Dataset,IterableDataset +from transformers import T5Tokenizer + +from .augmentor import Augmentor +from .compile_utils import VanillaTransformer_compiler +from data_representation import vocab_utils + +def get_emb_total_size(config, vocab): + emb_param = config.nn_params.emb + total_size = 0 + for feature in vocab.feature_list: + size = int(emb_param[feature] * emb_param.emb_size) + total_size += size + emb_param[feature] = size + emb_param.total_size = total_size + config.nn_params.emb = emb_param + return config + +class TuneCompiler(Dataset): + def __init__( + self, + data:List[Tuple[np.ndarray, str]], + data_type:str, + augmentor:Augmentor, + vocab:vocab_utils.LangTokenVocab, + input_length:int, + first_pred_feature:str, + caption_path:Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The data is distributed on-the-fly by the TuneCompiler + Pitch, Chord augementation is applied to the training data every iteration + Segmentation is applied every epoch for the training data + ''' + super().__init__() + self.data_list = data + self.data_type = data_type + self.augmentor = augmentor + self.eos_token = vocab.eos_token + self.compile_function = VanillaTransformer_compiler( + data_list=self.data_list, + augmentor=self.augmentor, + eos_token=self.eos_token, + input_length=input_length, + first_pred_feature=first_pred_feature, + encoding_scheme=vocab.encoding_scheme + ) + self.segment2tune_name = None + self.tune_name2segment = None + self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large",legacy=False) # Initialize T5 tokenizer for caption processing + + + if self.data_type == 'valid' or self.data_type == 'test': + self._update_segments_for_validset() + else: + self._update_segments_for_trainset() + + def _update_segments_for_trainset(self, random_seed=0): + random.seed(random_seed) + if self.segment2tune_name is not None: + # If segments are already compiled, we can skip the compilation + print("Segments are already compiled, skipping compilation") + return + print("Compiling segments for training data") + with Lock(): + self.segments, _, self.segment2tune_name = self.compile_function.make_segments(self.data_type) + print(f"number of trainset segments: {len(self.segments)}") + + def _update_segments_for_validset(self, random_seed=0): + random.seed(random_seed) + with Lock(): + self.segments, self.tune_name2segment, self.segment2tune_name = self.compile_function.make_segments(self.data_type) + print(f"number of testset segments: {len(self.segments)}") + + def __getitem__(self, idx): + segment, tensor_mask = self.segments[idx] + tune_name = self.segment2tune_name[idx] + try: + encoded_caption = self.t5_tokenizer(tune_name, return_tensors='pt', padding='max_length', truncation=True, max_length=128) + except Exception as e: + print(f"Error encoding caption for tune {tune_name}: {e}") + encoded_caption = self.t5_tokenizer("No caption available", return_tensors='pt', padding='max_length', truncation=True, max_length=128) + return segment, tensor_mask, tune_name, encoded_caption + if self.data_type == 'train': + augmented_segment = self.augmentor(segment) + return augmented_segment, tensor_mask, tune_name, encoded_caption + else: + return segment, tensor_mask, 
tune_name, encoded_caption + + def get_segments_with_tune_idx(self, tune_name, seg_order): + ''' + This function is used to retrieve the segment with the tune name and segment order during the validation + ''' + segments_list = self.tune_name2segment[tune_name] + segment_idx = segments_list[seg_order] + segment, mask = self.segments[segment_idx][0], self.segments[segment_idx][1] + return segment, mask + + def __len__(self): + return len(self.segments) + +class IterTuneCompiler(IterableDataset): + def __init__( + self, + data: List[Tuple[np.ndarray, str]], + data_type: str, + augmentor: Augmentor, + vocab: vocab_utils.LangTokenVocab, + input_length: int, + first_pred_feature: str, + caption_path: Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The data is distributed on-the-fly by the IterTuneCompiler. + Pitch, Chord augmentation is applied to the training data every iteration. + Segmentation is applied every epoch for the training data. + ''' + super().__init__() + self.data_list = data + self.data_type = data_type + self.augmentor = augmentor + self.eos_token = vocab.eos_token + self.compile_function = VanillaTransformer_compiler( + data_list=self.data_list, + augmentor=self.augmentor, + eos_token=self.eos_token, + input_length=input_length, + first_pred_feature=first_pred_feature, + encoding_scheme=vocab.encoding_scheme + ) + self.t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False) + self.random_seed = 0 + + def __iter__(self): + # This will yield ([segment, mask], tune_name2segment, segment2tune_name) + generator = self.compile_function.make_segments_iters(self.data_type) + for ([segment, mask], tune_name2segment, segment2tune_name) in generator: + # print(len(segment2tune_name), len(tune_name2segment)) + tune_name = segment2tune_name[-1] # Get the last tune name from the segment2tune_name list + # print(f"Processing tune: {tune_name}") + try: + encoded_caption = self.t5_tokenizer(tune_name, return_tensors='pt', padding='max_length', truncation=True, max_length=128) + except Exception as e: + encoded_caption = self.t5_tokenizer("No caption available", return_tensors='pt', padding='max_length', truncation=True, max_length=128) + if self.data_type == 'train': + segment = self.augmentor(segment) + # use input_ids replace tune_name + tune_name = encoded_caption['input_ids'][0] # Use the input_ids from the encoded caption + yield segment, mask, tune_name, encoded_caption + + def __len__(self): + # If you want to use __len__, you need to know the number of segments in advance. + # Otherwise, you can raise an exception or return a default value. + raise NotImplementedError("IterTuneCompiler is an iterable dataset and does not support __len__.") + +class SymbolicMusicDataset(Dataset): + def __init__( + self, + vocab: vocab_utils.LangTokenVocab, + encoding_scheme: str, + num_features: int, + debug: bool, + aug_type: Union[str, None], + input_length: int, + first_pred_feature: str, + caption_path: Union[str, None] = None, + for_evaluation: bool = False + ): + ''' + The vocabulary containing token representations for the dataset + The encoding scheme used for representing symbolic music (e.g., REMI, NB, etc.) 
+ The number of features used for the dataset + Debug mode; limits dataset size for faster testing if enabled + Type of data augmentation to apply, if 'random' the compiler will apply pitch and chord augmentation + Length of the input sequence for each sample + Feature to predict first which is used for compound shift for NB, if not shift, 'type' is used + ''' + super().__init__() + # Initializing instance variables + self.encoding_scheme = encoding_scheme + self.num_features = num_features + self.debug = debug + self.input_length = input_length + self.first_pred_feature = first_pred_feature + self.caption_path = caption_path + self.for_evaluation = for_evaluation + + # Load the vocabulary passed into the constructor + self.vocab = vocab + + # Initialize augmentor for data augmentation + self.augmentor = Augmentor(vocab=self.vocab, aug_type=aug_type, input_length=input_length) + + # Load preprocessed tune indices + if self.for_evaluation: + # For evaluation, we load the tune indices without any augmentation + self.tune_in_idx, self.len_tunes, self.file_name_list = [], [], [] + else: + self.tune_in_idx, self.len_tunes, self.file_name_list = self._load_tune_in_idx() + # Plot the histogram of tune lengths for analysis + dataset_name = self.__class__.__name__ # Get the class name (dataset name) + len_dir_path = Path(f"len_tunes/{dataset_name}") # Directory to store tune length histograms + len_dir_path.mkdir(parents=True, exist_ok=True) # Create directory if it doesn't exist + if self. for_evaluation is False: + self._plot_hist(self.len_tunes, len_dir_path / f"len_{encoding_scheme}{num_features}.png") + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + # Load preprocessed tune indices from .npz files + print("preprocessed tune_in_idx data is being loaded") + + # List of files containing tune index data + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + + # If debug mode is enabled, limit the number of loaded files + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + + # Initialize dictionaries and lists for storing tune index data, tune lengths, and file names + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # Load tune index data from each .npz file + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] # Load the numpy array from the file + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx # Store the tune indices in the dictionary + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) # Record the length of the tune + file_name_list.append(tune_in_idx_file.stem) # Append the file name (without extension) + + return tune_in_idx_dict, len_tunes, file_name_list # Return the data structures + + def _plot_hist(self, len_tunes, path_outfile): + # Plot histogram of tune lengths and save the plot + Path(path_outfile).parent.mkdir(parents=True, exist_ok=True) # Ensure the directory for the plot exists + + # Convert tune lengths to a NumPy array + data = np.array(list(len_tunes.values())) + + # Compute mean and standard deviation of tune lengths + self.mean_len_tunes = np.mean(data) + data_mean = np.mean(data) + data_std = np.std(data) + + # cumpute the total length of all tunes + self.total_len_tunes = np.sum(data) + + # Plot the histogram + plt.figure(dpi=100) + plt.hist(data, bins=50) + plt.title(f"mean: 
{data_mean:.2f}, std: {data_std:.2f}, total: {self.total_len_tunes}, num_tunes: {len(data)}") + plt.savefig(path_outfile) # Save the plot to file + plt.close() # Close the plot to free memory + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + # Split the dataset into train, validation, and test sets based on the given ratio + shuffled_tune_names = list(self.tune_in_idx.keys()) # Get the list of all tune names + random.seed(seed) # Set the seed for reproducibility + random.shuffle(shuffled_tune_names) # Shuffle the tune names + + # Compute the number of training, validation, and test samples + num_train = int(len(shuffled_tune_names) * ratio) + num_valid = int(len(shuffled_tune_names) * (1 - ratio) / 2) + + # Split the tune names into training, validation, and test sets + train_names = shuffled_tune_names[:num_train] + valid_names = shuffled_tune_names[num_train:num_train + num_valid] + test_names = shuffled_tune_names[num_train + num_valid:] + + return train_names, valid_names, test_names, shuffled_tune_names # Return the split lists + + def split_train_valid_test_set(self, dataset_name=None, ratio=None, seed=42, save_dir=None, for_evaluation: bool = False): + # Split the dataset into train, validation, and test sets or load an existing split + if not Path(f"metadata/{dataset_name}_caption_metadata.json").exists(): + # If no metadata exists, perform a random split and save metadata + assert ratio is not None, "ratio should be given when you make metadata for split" + + # Perform the split + train_names, valid_names, test_names, shuffled_tune_names = self._get_split_list_from_tune_in_idx(ratio, seed) + + # Log the split information + print(f"Randomly split train and test set using seed {seed}") + out_dict = {'shuffle_seed': seed, # Seed used for shuffling + 'shuffled_names': shuffled_tune_names, # Shuffled list of tune names + 'train': train_names, # Training set names + 'valid': valid_names, # Validation set names + 'test': test_names} # Test set names + + # Save the split metadata to a JSON file + with open(f"metadata/{dataset_name}_caption_metadata.json", "w") as f: + json.dump(out_dict, f, indent=2) + else: + # If metadata already exists, load it + with open(f"metadata/{dataset_name}_caption_metadata.json", "r") as f: + out_dict = json.load(f) + + # Ensure that the loaded data matches the current dataset + train_names, valid_names, test_names = out_dict['train'], out_dict['valid'], out_dict['test'] + if self.for_evaluation is False: + assert set(out_dict['shuffled_names']) == set(self.tune_in_idx.keys()), "Loaded data is not matched with the recorded metadata" + + # Prepare training, validation, and test datasets using the TuneCompiler + if self.for_evaluation: + # For evaluation, we do not need to create train and valid datasets + train_data = [] + valid_data = [] + self.test_data = [] + else: + train_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in train_names] + valid_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in valid_names] + self.test_data = [(self.tune_in_idx[tune_name], tune_name) for tune_name in test_names] + + # Initialize TuneCompiler objects for each split + # if self.for_evaluation: + # train_dataset = None # No training dataset for evaluation + # valid_dataset = None + # test_dataset = TuneCompiler(data=self.test_data, data_type='test', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + # else: + train_dataset = IterTuneCompiler(data=train_data, 
data_type='train', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + valid_dataset = TuneCompiler(data=valid_data, data_type='valid', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + test_dataset = TuneCompiler(data=self.test_data, data_type='test', augmentor=self.augmentor, vocab=self.vocab, input_length=self.input_length, first_pred_feature=self.first_pred_feature) + + # Save metadata to a directory if specified + if save_dir is not None: + Path(save_dir).mkdir(parents=True, exist_ok=True) + with open(Path(save_dir) / f"{dataset_name}_metadata.json", "w") as f: + json.dump(out_dict, f, indent=2) + + # Return the datasets for training, validation, and testing + return train_dataset, valid_dataset, test_dataset + +class Pop1k7(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + +class SymphonyMIDI(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + +class LakhClean(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, 
shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class LakhClean(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class ariamidi(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, 
debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class gigamidi(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = 
OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class PretrainingDataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx_aria(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the aria dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_ariamidi/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_giga(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the gigamidi dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_gigamidi/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + 
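The _get_split_list_from_tune_in_idx variants above all follow the same pattern. The following is a compressed, illustrative sketch (hypothetical helper name; the real methods operate on self.tune_in_idx keys and also return the shuffled name list) of how versioned Lakh entries such as 'songA.1' and 'songA.2' are kept in the same split.

import re
import random
from collections import defaultdict

def split_by_song(tune_names, ratio=0.9, seed=42):
    # Strip a trailing ".<version>" so every version of a song lands in the same split.
    groups = defaultdict(list)
    for name in tune_names:
        groups[re.sub(r"\.\d+$", "", name)].append(name)
    songs = list(groups.keys())
    random.Random(seed).shuffle(songs)
    n_train = int(len(songs) * ratio)
    n_valid = int(len(songs) * (1 - ratio) / 2)
    expand = lambda subset: [tune for song in subset for tune in groups[song]]
    return (expand(songs[:n_train]),
            expand(songs[n_train:n_train + n_valid]),
            expand(songs[n_train + n_valid:]))

train, valid, test = split_by_song(["songA.1", "songA.2", "songB", "songC.1"])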
file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if "drums-only" in tune_in_idx_file.stem: + print(f"skipping {tune_in_idx_file.stem} as it is a drums-only file") + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_pop1k7(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the Pop1k7 dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_pop1k7/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_sod(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load preprocessed tune indices for the SOD dataset + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_SOD/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + def _load_tune_in_idx_lakh(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_LakhALLFined/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.lakh_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = 
item["caption"] + except Exception: + continue + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_xmidi(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_XMIDI_Dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.xmidi_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_new(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_new_dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + 
# load caption + self.caption_list = [] + with open(self.new_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + self.lakh_caption_path = "dataset/represented_data/tuneidx/train_set.json" + self.xmidi_caption_path = "dataset/represented_data/tuneidx/all_captions.json" + self.new_caption_path = "dataset/represented_data/tuneidx/new_dataset_captions_final.jsonl" + + # load all tune_in_idx data from aria, giga datasets + tune_in_idx_giga, len_tunes_giga, file_name_list_giga = self._load_tune_in_idx_giga() + tune_in_idx_aria, len_tunes_aria, file_name_list_aria = self._load_tune_in_idx_aria() + tune_in_idx_lakh, len_tunes_lakh, file_name_list_lakh = self._load_tune_in_idx_lakh() + tune_in_idx_xmidi, len_tunes_xmidi, file_name_list_xmidi = self._load_tune_in_idx_xmidi() + tune_in_idx_new, len_tunes_new, file_name_list_new = self._load_tune_in_idx_new() + + # merge the two datasets + tune_in_idx = {**tune_in_idx_aria, **tune_in_idx_giga, **tune_in_idx_lakh, **tune_in_idx_xmidi, **tune_in_idx_new} + len_tunes = {**len_tunes_aria, **len_tunes_giga, **len_tunes_lakh, **len_tunes_xmidi, **len_tunes_new} + file_name_list = file_name_list_aria + file_name_list_giga + file_name_list_lakh + file_name_list_xmidi + file_name_list_new + print(f"number of loaded tunes: {len(tune_in_idx)}") + return tune_in_idx, len_tunes, file_name_list + + +class SOD(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + with open("metadata/SOD_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + if tune_in_idx_file.stem in irregular_tunes: + continue + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[tune_in_idx_file.stem] = tune_in_idx + len_tunes[tune_in_idx_file.stem] = len(tune_in_idx) + file_name_list.append(tune_in_idx_file.stem) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + +class BachChorale(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + +class Pop909(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Pop909 dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"-v\d+$", "", tune) for tune in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + + +class LakhALL(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # # remove file in tune_in_idx_list + # location2caption[item["location"]] = "test_set" + # continue + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key, None) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + print(f"Caption for {location_key} is None, skipping this tune") + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class LakhALLFined(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation 
quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + try: + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + except: + print("Error filtering None values in tune_in_idx, skipping filtering") + return [], [], [], [] + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class XMIDI_Dataset(SymbolicMusicDataset): + def __init__(self, vocab, 
encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + print(f"Processing file: {tune_in_idx_file.stem}, location_key: {location_key}") + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + 
train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class new_dataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + # if item["test_set"] is True: + # continue # skip test set tunes + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.split("/")[-1] + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + 
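# (Illustrative note, not part of the original commit: the version suffix is stripped at
#  this step so that every rendition of a song lands in the same split, e.g. both
#  "somesong" and "somesong.2" are grouped under the key "somesong"; with e.g. ratio=0.9,
#  the remaining 10% of unique songs is halved between the validation and test splits.)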
song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +class SymphonyNet_Dataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path) + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_{self.__class__.__name__}/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"/data2/suhongju/research/music-generation/BandZero/SymphonyNet_Dataset/{location_key}" + try: + caption = location2caption.get(location_key, None) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + print(f"Caption for {location_key} is None, skipping this tune") + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = shuffled_tune_names + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = 
list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + +# use lakhAllFined, XMIDI_dataset, new_dataset, as finetune dataset +class FinetuneDataset(SymbolicMusicDataset): + def __init__(self, vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path=None, + for_evaluation: bool = False): + super().__init__(vocab, encoding_scheme, num_features, debug, aug_type, input_length, first_pred_feature, caption_path, + for_evaluation=for_evaluation) + + + def _load_tune_in_idx_lakh(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_LakhALLFined/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.lakh_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem.replace("_", "/", 1) + ".mid" + location_key = f"lmd_full/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_xmidi(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = 
sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_XMIDI_Dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.xmidi_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".midi" + location_key = f"{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx_new(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Irregular tunes are removed from the dataset for better generation quality + It includes tunes that are not quantized properly, mostly theay are expressive performance data + ''' + print("preprocessed tune_in_idx data is being loaded") + tune_in_idx_list = sorted(list(Path(f"dataset/represented_data/tuneidx/tuneidx_new_dataset/{self.encoding_scheme}{self.num_features}").rglob("*.npz"))) + if self.debug: + tune_in_idx_list = tune_in_idx_list[:5000] + tune_in_idx_dict = OrderedDict() + len_tunes = OrderedDict() + file_name_list = [] + + # load caption + self.caption_list = [] + with open(self.new_caption_path, "r") as f: + # every line is a caption for the tune + for line in f: + self.caption_list.append(line.strip()) + print(f"number of loaded captions: {len(self.caption_list)}") + + with open("metadata/LakhClean_irregular_tunes.json", "r") as f: + irregular_tunes = json.load(f) + + # 构建 location 到 caption 的映射 + location2caption = {} + for line in self.caption_list: + try: + # 假设每行是一个json字符串 + item = json.loads(line) + location2caption[item["location"]] = item["caption"] + except Exception: + continue + + for tune_in_idx_file in tqdm(tune_in_idx_list, total=len(tune_in_idx_list)): + # 0_06d3f5a5954848ba13b9128f68f0a1d1 -> 0/06d3f5a5954848ba13b9128f68f0a1d1 + location_key = tune_in_idx_file.stem + ".mid" + location_key = f"new_data_new_dataset/{location_key}" + try: + caption = location2caption.get(location_key) + except KeyError: + print(f"KeyError: {location_key} not found in location2caption") + continue + if caption is None: + continue + # print(tune_in_idx_file.stem, location_key, caption) + # print("*" * 20) + # 你可以在这里使用caption变量 + tune_in_idx = np.load(tune_in_idx_file)['arr_0'] + tune_in_idx_dict[caption] = tune_in_idx + len_tunes[caption] = 
len(tune_in_idx) + file_name_list.append(caption) + print(f"number of loaded tunes: {len(tune_in_idx_dict)}") + return tune_in_idx_dict, len_tunes, file_name_list + + def _load_tune_in_idx(self) -> Tuple[Dict[str, np.ndarray], Dict[str, int], List[str]]: + ''' + Load tune_in_idx from all three datasets + ''' + self.lakh_caption_path = "dataset/represented_data/tuneidx/train_set.json" + self.xmidi_caption_path = "dataset/represented_data/tuneidx/all_captions.json" + self.new_caption_path = "dataset/represented_data/tuneidx/new_dataset_captions_final.jsonl" + + tune_in_idx_lakh, len_tunes_lakh, file_name_list_lakh = self._load_tune_in_idx_lakh() + tune_in_idx_xmidi, len_tunes_xmidi, file_name_list_xmidi = self._load_tune_in_idx_xmidi() + tune_in_idx_new, len_tunes_new, file_name_list_new = self._load_tune_in_idx_new() + # 合并三个数据集 + tune_in_idx = {**tune_in_idx_lakh, **tune_in_idx_xmidi, **tune_in_idx_new} + len_tunes = {**len_tunes_lakh, **len_tunes_xmidi, **len_tunes_new} + file_name_list = file_name_list_lakh + file_name_list_xmidi + file_name_list_new + print(f"number of loaded tunes: {len(tune_in_idx)}") + return tune_in_idx, len_tunes, file_name_list + + def _get_split_list_from_tune_in_idx(self, ratio, seed): + ''' + As Lakh dataset contains multiple versions of the same song, we split the dataset based on the song name + ''' + # filter out none in tune_in_idx + print("length of tune_in_idx before filtering:", len(self.tune_in_idx)) + try: + self.tune_in_idx = {k: v for k, v in self.tune_in_idx.items() if v is not None} + except: + print("Error filtering None values in tune_in_idx, skipping filtering") + return [], [], [], [] + print("length of tune_in_idx after filtering:", len(self.tune_in_idx)) + shuffled_tune_names = list(self.tune_in_idx.keys()) + song_names_without_version = [re.sub(r"\.\d+$", "", song) for song in shuffled_tune_names] + song_dict = {} + for song, orig_song in zip(song_names_without_version, shuffled_tune_names): + if song not in song_dict: + song_dict[song] = [] + song_dict[song].append(orig_song) + unique_song_names = list(song_dict.keys()) + random.seed(seed) + random.shuffle(unique_song_names) + num_train = int(len(unique_song_names)*ratio) + num_valid = int(len(unique_song_names)*(1-ratio)/2) + train_names = [] + valid_names = [] + test_names = [] + for song_name in unique_song_names[:num_train]: + train_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train:num_train+num_valid]: + valid_names.extend(song_dict[song_name]) + for song_name in unique_song_names[num_train+num_valid:]: + test_names.extend(song_dict[song_name]) + return train_names, valid_names, test_names, shuffled_tune_names + \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/decoding_utils.py b/Amadeus/symbolic_encoding/decoding_utils.py new file mode 100644 index 0000000..99312a3 --- /dev/null +++ b/Amadeus/symbolic_encoding/decoding_utils.py @@ -0,0 +1,404 @@ +import os, sys +from pathlib import Path + +import matplotlib.pyplot as plt +from collections import defaultdict + +from music21 import converter +import muspy +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument, TempoChange, Note, TimeSignature + +from .midi2audio import FluidSynth +from data_representation.constants import PROGRAM_INSTRUMENT_MAP + +class MuteWarn: + def __enter__(self): + self._init_stdout = sys.stdout + sys.stdout = open(os.devnull, "w") + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stdout.close() + sys.stdout = self._init_stdout + 
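# Illustrative sketch (not part of the original commit): MuteWarn temporarily redirects
# stdout to os.devnull, and the helper functions below use it to silence music21/muspy
# chatter while rendering a decoded MIDI file to score images, piano rolls, or audio.
with MuteWarn():
    print("swallowed by os.devnull")      # nothing appears while the context is active
print("stdout is restored after the block")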
+def save_score_image_from_midi(midi_fn, file_name): + assert isinstance(midi_fn, str) + with MuteWarn(): + convert = converter.parse(midi_fn) + convert.write('musicxml.png', fp=file_name) + +def save_pianoroll_image_from_midi(midi_fn, file_name): + assert isinstance(midi_fn, str) + midi_obj_muspy = muspy.read_midi(midi_fn) + midi_obj_muspy.show_pianoroll(track_label='program', preset='frame') + plt.gcf().set_size_inches(20, 10) + plt.savefig(file_name) + plt.close() + +def save_wav_from_midi(midi_fn, file_name, qpm=80): + assert isinstance(midi_fn, str) + with MuteWarn(): + music = muspy.read_midi(midi_fn) + music.tempos[0].qpm = qpm + music.write_audio(file_name, rate=44100, gain=3) + +def save_wav_from_midi_fluidsynth(midi_fn, file_name, gain=3): + assert isinstance(midi_fn, str) + fs = FluidSynth(gain=gain) + fs.midi_to_audio(midi_fn, file_name) + +class MidiDecoder4REMI: + def __init__( + self, + vocab, + in_beat_resolution, + dataset_name + ): + self.vocab = vocab + self.in_beat_resolution = in_beat_resolution + self.dataset_name = dataset_name + if dataset_name == 'SymphonyMIDI': + self.gain = 0.7 + elif dataset_name == 'SOD' or dataset_name == 'LakhClean': + self.gain = 1.1 + elif dataset_name == 'Pop1k7' or dataset_name == 'Pop909': + self.gain = 2.5 + else: + self.gain = 1.5 + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: list of tensor, the tensor + ''' + idx2event = self.vocab.idx2event + if generated_output.dim() == 2: + generated_output = generated_output.squeeze(0) + events = [idx2event[token.item()] for token in generated_output] + + midi_obj = miditoolkit.midi.parser.MidiFile() + if 'tempo' not in idx2event.keys(): + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + instr_notes_dict = defaultdict(list) + for i in range(len(events)-2): + cur_event = events[i] + # print(cur_event) + name = cur_event.split('_')[0] + attr = cur_event.split('_') + if name == 'Bar': + bar_pos += cur_bar_resol + if 'time' in cur_event: + cur_num, cur_denom = attr[-1].split('/') + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif name == 'Beat': + beat_pos = int(attr[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks + elif name == 'Chord': + chord_text = attr[1] + '_' + attr[2] + midi_obj.markers.append(Marker(text=chord_text, time=cur_pos)) + elif name == 'Tempo': + midi_obj.tempo_changes.append( + TempoChange(tempo=int(attr[1]), time=cur_pos)) + elif name == 'Instrument': + cur_instr = int(attr[1]) + else: + if len(self.vocab.feature_list) == 7 or len(self.vocab.feature_list) == 8: + if 'Note_Pitch' in events[i] and \ + 'Note_Duration' in events[i+1] and \ + 'Note_Velocity' in events[i+2]: + pitch = int(events[i].split('_')[-1]) + duration = int(events[i+1].split('_')[-1]) + duration = duration * default_in_beat_ticks + end = cur_pos + duration + velocity = int(events[i+2].split('_')[-1]) + instr_notes_dict[cur_instr].append( + Note( + pitch=pitch, + start=cur_pos, + end=end, + velocity=velocity)) + elif len(self.vocab.feature_list) == 4 or len(self.vocab.feature_list) == 5: + if 'Note_Pitch' in 
events[i] and \ + 'Note_Duration' in events[i+1]: + pitch = int(events[i].split('_')[-1]) + duration = int(events[i+1].split('_')[-1]) + duration = duration * default_in_beat_ticks + end = cur_pos + duration + velocity = 90 + instr_notes_dict[cur_instr].append( + Note( + pitch=pitch, + start=cur_pos, + end=end, + velocity=velocity)) + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + # make subdir + music_path = os.path.join(os.path.dirname(output_path), 'music') + prompt_music_path = os.path.join(os.path.dirname(output_path), 'prompt_music') + if not os.path.exists(music_path): + os.makedirs(music_path) + if not os.path.exists(prompt_music_path): + os.makedirs(prompt_music_path) + # if not contain 'prompt' in output_path, save prompt music + if 'prompt' in output_path: + music_path = os.path.join(prompt_music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + else: + music_path = os.path.join(music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + + midi_obj.dump(output_path) + # save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, music_path, gain=self.gain) + return midi_obj + +class MidiDecoder4CP(MidiDecoder4REMI): + def __init__(self, vocab, in_beat_resolution, dataset_name): + super().__init__(vocab, in_beat_resolution, dataset_name) + + def _update_chord_tempo(self, midi_obj, cur_pos, token_with_7infos, feature2idx): + if len(feature2idx) == 7 or len(feature2idx) == 8: + # chord + if token_with_7infos[feature2idx['chord']] != 'CONTI' and token_with_7infos[feature2idx['chord']] != 0: + midi_obj.markers.append( + Marker(text=str(token_with_7infos[feature2idx['chord']]), time=cur_pos)) + # tempo + if token_with_7infos[feature2idx['tempo']] != 'CONTI' and token_with_7infos[feature2idx['tempo']] != 0 and token_with_7infos[feature2idx['tempo']] != "Tempo_N_N": + tempo = int(token_with_7infos[feature2idx['tempo']].split('_')[-1]) + midi_obj.tempo_changes.append( + TempoChange(tempo=tempo, time=cur_pos)) + return midi_obj + elif len(feature2idx) == 4 or len(feature2idx) == 5: + return midi_obj + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: tensor, batch x seq_len x num_types + num_types includes: type, tempo, chord,'beat, pitch, duration, velocity + ''' + idx2event = self.vocab.idx2event + feature_keys = self.vocab.feature_list + feature2idx = {key: idx for idx, key in enumerate(feature_keys)} + + midi_obj = miditoolkit.midi.parser.MidiFile() + if len(feature2idx) == 4 or len(feature2idx) == 5: + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + instr_notes_dict = defaultdict(list) + generated_output = generated_output.squeeze(0) + for i in range(len(generated_output)): + token_with_7infos = [] + for kidx, key in enumerate(feature_keys): + token_with_7infos.append(idx2event[key][generated_output[i][kidx].item()]) + # type token + if 'time_signature' in token_with_7infos[feature2idx['type']]: + cur_num, 
cur_denom = token_with_7infos[feature2idx['type']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif token_with_7infos[feature2idx['type']] == 'Metrical': + if 'time_signature' in token_with_7infos[feature2idx['beat']]: + cur_num, cur_denom = token_with_7infos[feature2idx['beat']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + elif token_with_7infos[feature2idx['beat']] == 'Bar': + bar_pos += cur_bar_resol + elif 'Beat' in str(token_with_7infos[feature2idx['beat']]): + beat_pos = int(token_with_7infos[feature2idx['beat']].split('_')[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks # ticks + # chord and tempo + midi_obj = self._update_chord_tempo(midi_obj, cur_pos, token_with_7infos, feature2idx) + elif token_with_7infos[feature2idx['type']] == 'Note': + # instrument token + if len(feature2idx) == 8 or len(feature2idx) == 5: + if token_with_7infos[feature2idx['instrument']] != 0 and token_with_7infos[feature2idx['instrument']] != 'CONTI': + cur_instr = int(token_with_7infos[feature2idx['instrument']].split('_')[-1]) + else: + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + try: + pitch = token_with_7infos[feature2idx['pitch']].split('_')[-1] + duration = token_with_7infos[feature2idx['duration']].split('_')[-1] + duration = int(duration) * default_in_beat_ticks + if len(feature2idx) == 7 or len(feature2idx) == 8: + velocity = token_with_7infos[feature2idx['velocity']].split('_')[-1] + else: + velocity = 80 + end = cur_pos + duration + instr_notes_dict[cur_instr].append( + Note( + pitch=int(pitch), + start=cur_pos, + end=end, + velocity=int(velocity)) + ) + except: + continue + else: # when new bar started without beat + continue + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + output_music_dir = os.path.join(os.path.dirname(output_path), 'music') + if not os.path.exists(output_music_dir): + os.makedirs(output_music_dir) + midi_obj.dump(output_path) + save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, output_music_dir.replace('.mid', '.wav'), gain=self.gain) + return midi_obj + +class MidiDecoder4NB(MidiDecoder4REMI): + def __init__(self, vocab, in_beat_resolution, dataset_name): + super().__init__(vocab, in_beat_resolution, dataset_name) + + def _update_additional_info(self, midi_obj, cur_pos, token_with_7infos, feature2idx): + if len(feature2idx) == 7 or len(feature2idx) == 8: + # chord + if token_with_7infos[feature2idx['chord']] != 'CONTI' and token_with_7infos[feature2idx['chord']] != 0 and token_with_7infos[feature2idx['chord']] != 'Chord_N_N': + midi_obj.markers.append( + Marker(text=str(token_with_7infos[feature2idx['chord']]), 
time=cur_pos)) + # tempo + if token_with_7infos[feature2idx['tempo']] != 'CONTI' and token_with_7infos[feature2idx['tempo']] != 0 and token_with_7infos[feature2idx['tempo']] != "Tempo_N_N": + tempo = int(token_with_7infos[feature2idx['tempo']].split('_')[-1]) + midi_obj.tempo_changes.append( + TempoChange(tempo=tempo, time=cur_pos)) + return midi_obj + elif len(feature2idx) == 4 or len(feature2idx) == 5: + return midi_obj + + def __call__(self, generated_output, output_path=None): + ''' + generated_output: tensor, batch x seq_len x num_types + num_types includes: type, beat, chord, tempo, intrument, pitch, duration, velocity + ''' + idx2event = self.vocab.idx2event + feature_keys = self.vocab.feature_list + feature2idx = {key: idx for idx, key in enumerate(feature_keys)} + + midi_obj = miditoolkit.midi.parser.MidiFile() + if len(feature2idx) == 4 or len(feature2idx) == 5: + default_tempo = 95 + midi_obj.tempo_changes.append( + TempoChange(tempo=default_tempo, time=0)) + default_ticks_per_beat = 480 + default_in_beat_ticks = 480 // self.in_beat_resolution + cur_pos = 0 + bar_pos = 0 + cur_bar_resol = 0 + beat_pos = 0 + instr_notes_dict = defaultdict(list) + generated_output = generated_output.squeeze(0) + for i in range(len(generated_output)): + token_with_7infos = [] + for kidx, key in enumerate(feature_keys): + token_with_7infos.append(idx2event[key][generated_output[i][kidx].item()]) + # type token + if token_with_7infos[feature2idx['type']] == 'Empty_Bar' or token_with_7infos[feature2idx['type']] == 'SNN': + bar_pos += cur_bar_resol + elif 'NNN' in token_with_7infos[feature2idx['type']]: + cur_num, cur_denom = token_with_7infos[feature2idx['type']].split('_')[-1].split('/') + bar_pos += cur_bar_resol + new_bar_resol = int(default_ticks_per_beat * int(cur_num) * (4 / int(cur_denom))) + cur_bar_resol = new_bar_resol + midi_obj.time_signature_changes.append( + TimeSignature(numerator=int(cur_num), denominator=int(cur_denom), time=bar_pos)) + # instrument token + if len(feature2idx) == 8 or len(feature2idx) == 5: + if token_with_7infos[feature2idx['instrument']] != 0 and token_with_7infos[feature2idx['instrument']] != 'CONTI': + cur_instr = int(token_with_7infos[feature2idx['instrument']].split('_')[-1]) + else: + cur_instr = 0 if not self.dataset_name == 'BachChorale' else 53 + if 'Beat' in str(token_with_7infos[feature2idx['beat']]) or 'CONTI' in str(token_with_7infos[feature2idx['beat']]): + if 'Beat' in str(token_with_7infos[feature2idx['beat']]): # when beat is not CONTI beat is updated + beat_pos = int(token_with_7infos[feature2idx['beat']].split('_')[1]) + cur_pos = bar_pos + beat_pos * default_in_beat_ticks # ticks + # update chord and tempo + midi_obj = self._update_additional_info(midi_obj, cur_pos, token_with_7infos, feature2idx) + # note + try: + pitch = token_with_7infos[feature2idx['pitch']].split('_')[-1] + duration = token_with_7infos[feature2idx['duration']].split('_')[-1] # duration between 1~192 + duration = int(duration) * default_in_beat_ticks + if len(feature2idx) == 7 or len(feature2idx) == 8: + velocity = token_with_7infos[feature2idx['velocity']].split('_')[-1] + else: + velocity = 90 + end = cur_pos + duration + instr_notes_dict[cur_instr].append( + Note( + pitch=int(pitch), + start=cur_pos, + end=end, + velocity=int(velocity)) + ) + except: + continue + else: # when new bar started without beat + continue + + # save midi + for instr, notes in instr_notes_dict.items(): + instrument_name = PROGRAM_INSTRUMENT_MAP[instr] + if instr == 114: is_drum = True + else: 
is_drum = False + instr_track = Instrument(instr, is_drum=is_drum, name=instrument_name) + instr_track.notes = notes + midi_obj.instruments.append(instr_track) + + if isinstance(output_path, str) or isinstance(output_path, Path): + output_path = str(output_path) + music_path = os.path.join(os.path.dirname(output_path), 'music') + prompt_music_path = os.path.join(os.path.dirname(output_path), 'prompt_music') + if not os.path.exists(music_path): + os.makedirs(music_path) + if not os.path.exists(prompt_music_path): + os.makedirs(prompt_music_path) + # if not contain 'prompt' in output_path, save prompt music + if 'prompt' in output_path: + music_path = os.path.join(prompt_music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + else: + music_path = os.path.join(music_path, output_path.split('/')[-1].replace('.mid', '.wav')) + midi_obj.dump(output_path) + # save_pianoroll_image_from_midi(output_path, output_path.replace('.mid', '.png')) + save_wav_from_midi_fluidsynth(output_path, music_path, gain=self.gain) + return midi_obj diff --git a/Amadeus/symbolic_encoding/metric_utils.py b/Amadeus/symbolic_encoding/metric_utils.py new file mode 100644 index 0000000..138c9e0 --- /dev/null +++ b/Amadeus/symbolic_encoding/metric_utils.py @@ -0,0 +1,208 @@ +import torch +import numpy as np + +from collections import Counter + +# TODO: refactor hard coded values +def check_syntax_errors_in_inference_for_nb(generated_output, feature_list): + generated_output = generated_output.squeeze(0) + type_idx = feature_list.index('type') + beat_idx = feature_list.index('beat') + type_beat_list = [] + for token in generated_output: + type_beat_list.append((token[type_idx].item(), token[beat_idx].item())) # type, beat + + last_note = 1 + beat_type_unmatched_error_list = [] + num_unmatched_errors = 0 + beat_backwards_error_list = [] + num_backwards_errors = 0 + for type_beat in type_beat_list: + if type_beat[0] == 4: # same bar, new beat + if type_beat[1] == 0 or type_beat[1] == 1: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(type_beat) + if type_beat[1] <= last_note: + num_backwards_errors += 1 + beat_backwards_error_list.append([last_note, type_beat]) + else: + last_note = type_beat[1] # update last note + elif type_beat[0] >= 5: # new bar, new beat + if type_beat[1] == 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(type_beat) + last_note = 1 + unmatched_error_rate = num_unmatched_errors / len(type_beat_list) + backwards_error_rate = num_backwards_errors / len(type_beat_list) + type_beat_errors_dict = {'beat_type_unmatched_error': unmatched_error_rate, 'beat_backwards_error': backwards_error_rate} + return type_beat_errors_dict + +def check_syntax_errors_in_inference_for_cp(generated_output, feature_list): + generated_output = generated_output.squeeze(0) + type_idx = feature_list.index('type') + beat_idx = feature_list.index('beat') + pitch_idx = feature_list.index('pitch') + duration_idx = feature_list.index('duration') + last_note = 1 + beat_type_unmatched_error_list = [] + num_unmatched_errors = 0 + beat_backwards_error_list = [] + num_backwards_errors = 0 + for token in generated_output: + if token[type_idx].item() == 2: # Metrical + if token[pitch_idx].item() != 0 or token[duration_idx].item() != 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(token) + if token[beat_idx].item() == 1: # new bar + last_note = 1 # last note will be updated in the next token + elif token[beat_idx].item() != 0 and token[beat_idx].item() <= last_note: + 
num_backwards_errors += 1 + last_note = token[beat_idx].item() # update last note + beat_backwards_error_list.append([last_note, token]) + else: + last_note = token[beat_idx].item() # update last note + if token[type_idx].item() == 3: # Note + if token[beat_idx].item() != 0: + num_unmatched_errors += 1 + beat_type_unmatched_error_list.append(token) + unmatched_error_rate = num_unmatched_errors / len(generated_output) + backwards_error_rate = num_backwards_errors / len(generated_output) + type_beat_errors_dict = {'beat_type_unmatched_error': unmatched_error_rate, 'beat_backwards_error': backwards_error_rate} + return type_beat_errors_dict + +def check_syntax_errors_in_inference_for_remi(generated_output, vocab): + generated_output = generated_output.squeeze(0) + # to check duration errors + beat_mask = vocab.total_mask['beat'].to(generated_output.device) + beat_mask_for_target = beat_mask[generated_output] + beat_target = generated_output * beat_mask_for_target + bar_mask = vocab.total_mask['type'].to(generated_output.device) + bar_mask_for_target = bar_mask[generated_output] + bar_target = (generated_output+1) * bar_mask_for_target # as bar token in 0 in remi vocab, we add 1 to bar token + target = beat_target + bar_target + target = target[target!=0] + # collect beats in between bars(idx=1) + num_backwards_errors = 0 + collected_beats = [] + total_beats = 0 + for token in target: + if token == 1 or 3 <= token <= 26: # Bar_None, or Bar_time_signature + collected_beats_tensor = torch.tensor(collected_beats) + diff = torch.diff(collected_beats_tensor) + num_error_beats = torch.where(diff<=0)[0].shape[0] + num_backwards_errors += num_error_beats + collected_beats = [] + else: + collected_beats.append(token.item()) + total_beats += 1 + if total_beats != 0: + backwards_error_rate = num_backwards_errors / total_beats + else: + backwards_error_rate = 0 + # print(f"error rate in beat backwards: {backwards_error_rate}") + return {'beat_backwards_error': backwards_error_rate} + +def type_beat_errors_in_validation_nb(beat_prob, answer_type, input_beat, mask): + bool_mask = mask.bool().flatten() # (b*t) + pred_beat_idx = torch.argmax(beat_prob, dim=-1).flatten() # (b*t) + valid_pred_beat_idx = pred_beat_idx[bool_mask] # valid beat_idx + answer_type = answer_type.flatten() # (b*t) + valid_type_input = answer_type[bool_mask] # valid answer_type + type_beat_list = [] + for i in range(len(valid_pred_beat_idx)): + type_beat_list.append((valid_type_input[i].item(), valid_pred_beat_idx[i].item())) # type, beat + input_beat = input_beat.flatten() + valid_input_beat = input_beat[bool_mask] + + last_note = 1 + num_unmatched_errors = 0 + num_backwards_errors = 0 + for type_beat, input_beat_idx in zip(type_beat_list, valid_input_beat): + # update last note + if input_beat_idx.item() >= 1: # beat + last_note = input_beat_idx.item() + if type_beat[0] == 4: # same bar, new beat + if type_beat[1] == 0 or type_beat[1] == 1: + num_unmatched_errors += 1 + if type_beat[1] <= last_note: + num_backwards_errors += 1 + elif type_beat[0] >= 5: # new bar, new beat + if type_beat[1] == 0: + num_unmatched_errors += 1 + return len(type_beat_list), num_unmatched_errors, num_backwards_errors + +def type_beat_errors_in_validation_cp(beat_prob, answer_type, input_beat, mask): + bool_mask = mask.bool().flatten() # (b*t) + beat_idx = torch.argmax(beat_prob, dim=-1).flatten() # (b*t) + valid_beat_idx = beat_idx[bool_mask] # valid beat_idx + answer_type = answer_type.flatten() # (b*t) + valid_type_input = answer_type[bool_mask] # valid 
answer_type + type_beat_list = [] + for i in range(len(valid_beat_idx)): + type_beat_list.append((valid_type_input[i].item(), valid_beat_idx[i].item())) # type, beat + input_beat = input_beat.flatten() + valid_input_beat = input_beat[bool_mask] + + last_note = 1 + num_unmatched_errors = 0 + num_backwards_errors = 0 + for type_beat, input_beat_idx in zip(type_beat_list, valid_input_beat): + # update last note + if input_beat_idx.item() == 1: # bar + last_note = 1 + elif input_beat_idx.item() >= 2: # new beat + last_note = input_beat_idx.item() + # check errors + if type_beat[0] == 2: # Metrical + if type_beat[1] == 0: # ignore + num_unmatched_errors += 1 + elif type_beat[1] >= 2: # new beat + if type_beat[1] <= last_note: + num_backwards_errors += 1 + elif type_beat[0] == 3: # Note + if type_beat[1] != 0: + num_unmatched_errors += 1 + return len(type_beat_list), num_unmatched_errors, num_backwards_errors + +def get_beat_difference_metric(prob_dict, arranged_prob_dict, mask): + orign_beat_prob = prob_dict['beat'] # b x t x vocab_size + arranged_beat_prob = arranged_prob_dict['beat'] # b x t x vocab_size + + # calculate similarity between original beat prob and arranged beat prob + origin_beat_token = torch.argmax(orign_beat_prob, dim=-1) * mask # b x t + arranged_beat_token = torch.argmax(arranged_beat_prob, dim=-1) * mask # b x t + num_same_beat = torch.sum(origin_beat_token == arranged_beat_token) - torch.sum(mask==0) + num_beat = torch.sum(mask==1) + beat_sim = (num_same_beat / num_beat).item() # scalar + + # apply mask, shape of mask: b x t + orign_beat_prob = orign_beat_prob * mask.unsqueeze(-1) # b x t x vocab_size + arranged_beat_prob = arranged_beat_prob * mask.unsqueeze(-1) + + # calculate cosine similarity between original beat prob and arranged beat prob + orign_beat_prob = orign_beat_prob.flatten(0,1) # (b*t) x vocab_size + arranged_beat_prob = arranged_beat_prob.flatten(0,1) # (b*t) x vocab_size + cos = torch.nn.CosineSimilarity(dim=-1, eps=1e-6) + beat_cos_sim = cos(orign_beat_prob, arranged_beat_prob) # (b*t) + # exclude invalid tokens, zero padding tokens + beat_cos_sim = beat_cos_sim[mask.flatten().bool()] # num_valid_tokens + beat_cos_sim = torch.mean(beat_cos_sim).item() # scalar + return {'beat_cos_sim': beat_cos_sim, 'beat_sim': beat_sim} + +def get_gini_coefficient(generated_output): + if len(generated_output.shape) == 3: + generated_output = generated_output.squeeze(0).tolist() + gen_list = [tuple(x) for x in generated_output] + else: + gen_list = generated_output.squeeze(0).tolist() + counts = Counter(gen_list).values() + sorted_counts = sorted(counts) + n = len(sorted_counts) + cumulative_counts = np.cumsum(sorted_counts) + cumulative_proportion = cumulative_counts / cumulative_counts[-1] + + lorenz_area = sum(cumulative_proportion[:-1]) / n # Exclude the last element + equality_area = 0.5 # The area under line of perfect equality + + gini = (equality_area - lorenz_area) / equality_area + return gini \ No newline at end of file diff --git a/Amadeus/symbolic_encoding/midi2audio.py b/Amadeus/symbolic_encoding/midi2audio.py new file mode 100644 index 0000000..ddbae0f --- /dev/null +++ b/Amadeus/symbolic_encoding/midi2audio.py @@ -0,0 +1,78 @@ +import argparse +import os +import subprocess +from pydub import AudioSegment + +''' +This file is a modified version of midi2audio.py from https://github.com/bzamecnik/midi2audio +Author: Bohumír Zámečník (@bzamecnik) +License: MIT, see the LICENSE file +''' + +__all__ = ['FluidSynth'] + +DEFAULT_SOUND_FONT = 
'/data2/suhongju/research/music-generation/sound_file/CrisisGeneralMidi3.01.sf2' +DEFAULT_SAMPLE_RATE = 48000 +DEFAULT_GAIN = 0.05 +# DEFAULT_SOUND_FONT = "/data2/suhongju/research/music-generation/sound_file/Advent GM 7.sf2" +# DEFAULT_SOUND_FONT = '~/.fluidsynth/default_sound_font.sf2' +# DEFAULT_SAMPLE_RATE = 16000 +# DEFAULT_GAIN = 0.20 + +class FluidSynth(): + def __init__(self, sound_font=DEFAULT_SOUND_FONT, sample_rate=DEFAULT_SAMPLE_RATE, gain=DEFAULT_GAIN): + self.sample_rate = sample_rate + self.sound_font = os.path.expanduser(sound_font) + self.gain = gain + + def midi_to_audio(self, midi_file: str, audio_file: str, verbose=True): + if verbose: + stdout = None + else: + stdout = subprocess.DEVNULL + + # Convert MIDI to WAV + subprocess.call( + ['fluidsynth', '-ni', '-g', str(self.gain), self.sound_font, midi_file, '-F', audio_file, '-r', str(self.sample_rate)], + stdout=stdout + ) + + # Convert WAV to MP3 + # mp3_path = audio_file.replace('.wav', '.mp3') + # AudioSegment.from_wav(audio_file).export(mp3_path, format="mp3") + + # # Delete the temporary WAV file + # os.remove(audio_file) + + def play_midi(self, midi_file): + subprocess.call(['fluidsynth', '-i', '-g', str(self.gain), self.sound_font, midi_file, '-r', str(self.sample_rate)]) + +def parse_args(allow_synth=True): + parser = argparse.ArgumentParser(description='Convert MIDI to audio via FluidSynth') + parser.add_argument('midi_file', metavar='MIDI', type=str) + if allow_synth: + parser.add_argument('audio_file', metavar='AUDIO', type=str, nargs='?') + parser.add_argument('-s', '--sound-font', type=str, + default=DEFAULT_SOUND_FONT, + help='path to a SF2 sound font (default: %s)' % DEFAULT_SOUND_FONT) + parser.add_argument('-r', '--sample-rate', type=int, nargs='?', + default=DEFAULT_SAMPLE_RATE, + help='sample rate in Hz (default: %s)' % DEFAULT_SAMPLE_RATE) + return parser.parse_args() + +def main(allow_synth=True): + args = parse_args(allow_synth) + fs = FluidSynth(args.sound_font, args.sample_rate) + if allow_synth and args.audio_file: + fs.midi_to_audio(args.midi_file, args.audio_file) + else: + fs.play_midi(args.midi_file) + +def main_play(): + """ + A method for the `midiplay` entry point. It omits the audio file from args. 
+ """ + main(allow_synth=False) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/config-accelerate.yaml b/Amadeus/symbolic_yamls/config-accelerate.yaml new file mode 100644 index 0000000..0769a14 --- /dev/null +++ b/Amadeus/symbolic_yamls/config-accelerate.yaml @@ -0,0 +1,65 @@ +defaults: + # - nn_params: nb8_embSum_NMT + # - nn_params: remi8 + - nn_params: nb8_embSum_diff_t2m_150M_finetunning + # - nn_params: nb8_embSum_diff_t2m_150M_pretraining + # - nn_params: nb8_embSum_subPararell + # - nn_params: nb8_embSum_diff_t2m_150M + + # - nn_params: nb8_embSum_subFeedForward + # - nn_params: nb8_embSum_diff + # nn_params: nb8_SA_diff + # - nn_params: nb8_embSum_diff_main12head16dim512_ave + # - nn_params: nb8_embSum_NMT_main12_head_16_dim512 + # - nn_params: remi8_main12_head_16_dim512 + # - nn_params: nb5_embSum_diff_main12head16dim768_sub3 + +dataset: FinetuneDataset # Pop1k7, Pop909, SOD, LakhClean,PretrainingDataset FinetuneDataset +captions_path: dataset/midicaps/train_set.json + +# dataset: SymphonyNet_Dataset # Pop1k7, Pop909, SOD, LakhClean +# captions_path: dataset/symphonyNet/syd-caption.json + +use_ddp: True # True, False | distributed data parallel +use_fp16: True # True, False | mixed precision training +use_diff: True # True,use diffusion in subdecoder +diff_steps: 8 # number of diffusion steps +use_dispLoss: True +lambda_weight: 0.5 +tau: 0.5 + +train_params: + device: cuda + batch_size: 3 + grad_clip: 1.0 + num_iter: 300000 # total number of iterations + num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference + num_cycles_for_model_checkpoint: 1 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint + iterations_per_training_cycle: 10 # number of iterations for logging training loss + iterations_per_validation_cycle: 5000 # number of iterations for validation process + input_length: 3072 # input sequence length3072 + # you can use focal loss, it it's not used, set focal_gamma to 0 + focal_alpha: 1 + focal_gamma: 0 + # learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details + scheduler : cosinelr + initial_lr: 0.00005 + decay_step_rate: 0.8 # means it will reach its lowest point at decay_step_rate * total_num_iter + num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts' + warmup_steps: 2000 #number of warmup steps + max_lr: 0.00015 + gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts' + # Distributed Data Parallel + world_size: 5 # 0 means no distributed training + gradient_accumulation_steps: 4 # 1 means no gradient accumulation +inference_params: + num_uncond_generation: 1 # number of unconditional generation + num_cond_generation: 3 # number of conditional generation +data_params: + first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument) + split_ratio: 0.998 # train-validation-test split ratio + aug_type: pitch # random, null | pitch and chord augmentation type +general: + debug: False + make_log: True # True, False | update the log file in wandb online to your designated project and entity + infer_and_log: True # True, False | inference and log the results \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/config.yaml b/Amadeus/symbolic_yamls/config.yaml new file mode 100644 index 
0000000..9081dbc --- /dev/null +++ b/Amadeus/symbolic_yamls/config.yaml @@ -0,0 +1,54 @@ +defaults: + # - nn_params: nb8_embSum_NMT + # - nn_params: remi8 + # - nn_params: nb8_embSum_diff + - nn_params: nb8_embSum_subFeedForward + # - nn_params: nb8_SA_diff + # - nn_params: nb8_embSum_diff_main12head16dim512_ave + # - nn_params: nb8_embSum_NMT_main12_head_16_dim512 + # - nn_params: remi8_main12_head_16_dim512 + # - nn_params: nb5_embSum_diff_main12head16dim768_sub3 + +dataset: LakhClean # Pop1k7, Pop909, SOD, LakhClean +use_ddp: True # True, False | distributed data parallel +use_fp16: True # True, False | mixed precision training +use_diff: True # True, False | use diffusion in the sub-decoder +use_dispLoss: True +lambda_weight: 0.5 +tau: 0.5 +diff_steps: 8 # number of diffusion steps +train_params: + device: cuda + batch_size: 8 + grad_clip: 1.0 + num_iter: 25000 # total number of iterations + num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference + num_cycles_for_model_checkpoint: 10 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint + iterations_per_training_cycle: 10 # number of iterations for logging training loss + iterations_per_validation_cycle: 500 # number of iterations for validation process + input_length: 3072 # input sequence length + # you can use focal loss; if it's not used, set focal_gamma to 0 + focal_alpha: 1 + focal_gamma: 0 + # learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details + scheduler: cosinelr + initial_lr: 0.0001 + decay_step_rate: 0.4 # the lr reaches its lowest point at decay_step_rate * total_num_iter + num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts' + warmup_steps: 2000 # number of warmup steps + max_lr: 0.00015 + gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts' + # Distributed Data Parallel + world_size: 5 # 0 means no distributed training + gradient_accumulation_steps: 1 # 1 means no gradient accumulation +inference_params: + num_uncond_generation: 1 # number of unconditional generations + num_cond_generation: 3 # number of conditional generations +data_params: + first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument) + split_ratio: 0.99 # train-validation-test split ratio + aug_type: null # random, null | pitch and chord augmentation type +general: + debug: False + make_log: True # True, False | update the log file in wandb online to your designated project and entity + infer_and_log: True # True, False | inference and log the results \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml new file mode 100644 index 0000000..88b6ef7 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_NMT.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file
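The two configs above are Hydra-style: the defaults: list selects one YAML from the nn_params/ group and composes it with the top-level keys (dataset, train_params, inference_params, data_params, general). Below is a minimal loading sketch; it assumes Hydra/OmegaConf composition (suggested by the defaults: list and the DictConfig usage elsewhere in the repo), and the config_path, the override, and the printed values are illustrative assumptions rather than code from this commit.

from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose config.yaml together with the nn_params group file named in its
# defaults list (here nn_params/nb8_embSum_subFeedForward.yaml).
with initialize(version_base=None, config_path="symbolic_yamls"):
    cfg = compose(config_name="config")
    # A different group file can be swapped in with an override, e.g.:
    # cfg = compose(config_name="config", overrides=["nn_params=remi8"])

print(cfg.dataset)                           # LakhClean
print(cfg.nn_params.main_decoder.dim_model)  # 512
print(cfg.train_params.initial_lr)           # 0.0001
print(OmegaConf.to_yaml(cfg.nn_params.sub_decoder))

Keys under nn_params come from the selected group file, while train_params and the other top-level sections come from config.yaml itself.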
diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..2f185d7 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subCrossAttention.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml new file mode 100644 index 0000000..dae9889 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml new file mode 100644 index 0000000..97f2a75 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp5_embSum_subFeedForward_original.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 5 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +partial_sequential_prediction: True +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml new file mode 100644 index 0000000..93ee60f --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..fbcda81 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer 
+input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml new file mode 100644 index 0000000..e991ad3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml new file mode 100644 index 0000000..848b0a2 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/cp7_embSum_subFeedForward_original.yaml @@ -0,0 +1,20 @@ +encoding_scheme: cp +num_features: 7 +vocab_name: MusicTokenVocabCP +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +partial_sequential_prediction: True +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + input_length: 1024 + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml new file mode 100644 index 0000000..1d58b6d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml new file mode 100644 index 0000000..8602ebc --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git 
a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml new file mode 100644 index 0000000..f1f71e0 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml new file mode 100644 index 0000000..26c281b --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_diff_main12head16dim768_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..26a65e6 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml new file mode 100644 index 0000000..44f38fa --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml new file mode 100644 index 0000000..4022e9f --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: 
MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml new file mode 100644 index 0000000..db7611a --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subRNN.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: RNN +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml new file mode 100644 index 0000000..fb276a4 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb5_embSum_subSelfAttention.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 5 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: SelfAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml new file mode 100644 index 0000000..f25a42c --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml new file mode 100644 index 0000000..5359592 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subCrossAttention.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml 
b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml new file mode 100644 index 0000000..0103bea --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml new file mode 100644 index 0000000..161144e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml new file mode 100644 index 0000000..4db5550 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subRNN.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: RNN +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml new file mode 100644 index 0000000..71571a3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb7_embSum_subSelfAttention.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 7 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: SelfAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml new file mode 100644 index 0000000..2a3864e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_SA_diff.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SelfAttentionEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 
means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml new file mode 100644 index 0000000..123ba16 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml new file mode 100644 index 0000000..ace6af3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml new file mode 100644 index 0000000..412409d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMT_main12_head_16_dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml new file mode 100644 index 0000000..3510d68 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_NMTsub6.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: CrossAttention +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 6 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml new file mode 100644 index 0000000..4145a00 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff.yaml @@ -0,0 +1,19 @@ 
+encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml new file mode 100644 index 0000000..510a725 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_150M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml new file mode 100644 index 0000000..cd72a8c --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512.yaml @@ -0,0 +1,20 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True + \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml new file mode 100644 index 0000000..4d193a5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_ave.yaml @@ -0,0 +1,20 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: AverageEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: True + \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml new file mode 100644 index 0000000..37bf321 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_main12head16dim512_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 
+main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 3 + feature_enricher_use: True diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml new file mode 100644 index 0000000..6ec2c3d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub3.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 2 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml new file mode 100644 index 0000000..9b7411e --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_sub6.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 6 + feature_enricher_use: True \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml new file mode 100644 index 0000000..8c59625 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerCrossAttendDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml new file mode 100644 index 0000000..71e42f5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_finetunning.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerFinetuningDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.2 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 20 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml new file mode 100644 
index 0000000..9aa36a5 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_prefix.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerPrefixDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 16 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml new file mode 100644 index 0000000..8ffffdd --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_150M_pretraining.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerPretrainingDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 768 + num_layer: 20 + num_head: 12 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml new file mode 100644 index 0000000..6da49a3 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_diff_t2m_30M.yaml @@ -0,0 +1,19 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerCrossAttendDecoder +sub_decoder_name: DiffusionDecoder +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 + feature_enricher_use: False \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml new file mode 100644 index 0000000..473839d --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subFeedForward.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: FeedForward +model_dropout: 0.1 +input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml new file mode 100644 index 0000000..7d98ce6 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/nb8_embSum_subPararell.yaml @@ -0,0 +1,18 @@ +encoding_scheme: nb +num_features: 8 +vocab_name: MusicTokenVocabNB +model_name: NestedMusicTransformer +input_embedder_name: SummationEmbedder +main_decoder_name: XtransformerDecoder +sub_decoder_name: Parallel +model_dropout: 0.1 
+input_embedder: + num_layer: 1 + num_head: 8 +main_decoder: + dim_model: 512 + num_layer: 6 + num_head: 8 +sub_decoder: + decout_window_size: 1 # 1 means no previous decoding output added + num_layer: 1 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi5.yaml b/Amadeus/symbolic_yamls/nn_params/remi5.yaml new file mode 100644 index 0000000..dba0f34 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi5.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 5 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi7.yaml b/Amadeus/symbolic_yamls/nn_params/remi7.yaml new file mode 100644 index 0000000..b9b7768 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi7.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 7 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi8.yaml b/Amadeus/symbolic_yamls/nn_params/remi8.yaml new file mode 100644 index 0000000..20e2948 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi8.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 8 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 8 + num_head: 8 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml b/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml new file mode 100644 index 0000000..da50eb2 --- /dev/null +++ b/Amadeus/symbolic_yamls/nn_params/remi8_main12_head_16_dim512.yaml @@ -0,0 +1,12 @@ +encoding_scheme: remi +num_features: 8 +vocab_name: LangTokenVocab +model_name: NestedMusicTransformer +input_embedder_name: SingleEmbedding +main_decoder_name: XtransformerDecoder +sub_decoder_name: SingleProjection +model_dropout: 0.1 +main_decoder: + dim_model: 512 + num_layer: 12 + num_head: 16 \ No newline at end of file diff --git a/Amadeus/symbolic_yamls/symbolic_sweep.yaml b/Amadeus/symbolic_yamls/symbolic_sweep.yaml new file mode 100644 index 0000000..166bd7d --- /dev/null +++ b/Amadeus/symbolic_yamls/symbolic_sweep.yaml @@ -0,0 +1,17 @@ +program: train.py +method: grid +metric: + name: valid.total + goal: minimize +parameters: + train_params.batch_size: + values: [8] + train_params.focal_gamma: + values: [0, 1] + nn_params.main_decoder.input_length: + values: [8192] + +command: + - python3 + - ${program} + - ${args_no_hyphens} \ No newline at end of file diff --git a/Amadeus/train_utils.py b/Amadeus/train_utils.py new file mode 100644 index 0000000..a98ce59 --- /dev/null +++ b/Amadeus/train_utils.py @@ -0,0 +1,428 @@ +import math + +from numpy import mask_indices +import torch +import torch.nn as nn +from torch.optim.lr_scheduler import _LRScheduler +from torch.optim import Optimizer +from collections import defaultdict +import torch.nn.functional as F + +def add_conti_for_single_feature(tensor): + new_target = tensor.clone() + # 
Assuming tensor shape is [batch, sequence, features] + # Create a shifted version of the tensor + shifted_tensor = torch.roll(new_target, shifts=1, dims=1) + # The first element of each sequence cannot be a duplicate by definition + shifted_tensor[:, 0] = new_target[:, 0] + 1 + + # Identify where the original and shifted tensors are the same (duplicates) + duplicates = new_target == shifted_tensor + # Replace duplicates with 9999 + new_target[duplicates] = 9999 + return new_target + +def adjust_prediction_order(encoding_scheme, num_features, target_feature, nn_params): + feature_prediction_order_dict = { + 4: ["type", "beat", "pitch", "duration"], + 5: ["type", "beat", "instrument", "pitch", "duration"], + 7: ["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"], + 8: ["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"] + } + + if encoding_scheme == 'remi': + prediction_order = feature_prediction_order_dict[num_features] + elif encoding_scheme == 'cp': + if nn_params.get("partial_sequential_prediction", False): + default_prediction_order = feature_prediction_order_dict[num_features] + prediction_order = [default_prediction_order[0], default_prediction_order[1:]] + else: + prediction_order = feature_prediction_order_dict[num_features] + elif encoding_scheme == 'nb': + assert target_feature in feature_prediction_order_dict[num_features], f"Target feature {target_feature} not in the selected sub-token set. Please check target feature in the config and num_features in nn_params." + default_prediction_order = feature_prediction_order_dict[num_features] + + # Reorganize the prediction order based on the target_feature + target_index = default_prediction_order.index(target_feature) + prediction_order = default_prediction_order[target_index:] + default_prediction_order[:target_index] + + return prediction_order + +########################### Loss function ################################ + +class NLLLoss4REMI(): + def __init__( + self, + focal_alpha:float, + focal_gamma:float, + ): + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + # clamp min value to 1e-7 to avoid log(0) + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss_seq = loss * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss_seq.sum() / mask.sum() # calculating mean loss considering mask + return loss, loss_seq + + def __call__(self, logits, shifted_tgt, mask, vocab): + if vocab is not None: + loss, loss_seq = self.get_nll_loss(logits, shifted_tgt, mask) + loss_by_class_normal = defaultdict(float) + shifted_tgt_with_mask = shifted_tgt * mask # [b, t] + answers_idx = shifted_tgt_with_mask.flatten(0,1) # [b*t] + for feature in vocab.feature_list: + feature_mask = vocab.total_mask[feature].to(answers_idx.device) # [327,] + mask_for_target = feature_mask[answers_idx] # [b*t] + normal_loss_seq_by_class = loss_seq * mask_for_target + if mask_for_target.sum().item() != 0: + loss_by_class_normal[feature+'_normal'] += (normal_loss_seq_by_class.sum().item() / mask_for_target.sum().item()) + return loss, loss_by_class_normal + else: + loss, loss_seq = self.get_nll_loss(logits, shifted_tgt, mask) + return loss, None + +class 
NLLLoss4CompoundToken(): + def __init__(self, feature_list, focal_alpha:float, focal_gamma:float): + self.feature_list = feature_list + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + # clamp min value to 1e-7 to avoid log(0) + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss = loss * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss.sum() / mask.sum() # calculating mean loss considering mask + return loss + + def get_nll_loss_for_logging(self, logits, target, mask, ignore_token, conti_token): + probs = logits.softmax(dim=-1) + + if ignore_token is not None and conti_token is not None: + target_conti = add_conti_for_single_feature(target) # [batch_size*seq_len] + valid_mask = (target_conti != ignore_token) & (target_conti != conti_token) # [batch_size*seq_len] + elif ignore_token is not None and conti_token is None: + valid_mask = (target != ignore_token) + elif ignore_token is None and conti_token is None: + valid_mask = torch.ones_like(target).bool() + valid_mask = valid_mask.flatten(0, 1) + + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + pt = probs[torch.arange(len(target)), target] # [batch_size*seq_len] + total_mask = mask.flatten(0, 1) & valid_mask # [batch_size*seq_len] + loss = -self.alpha * (1-pt)**self.gamma * torch.log(pt) # [batch_size*seq_len] + loss = loss * total_mask # [batch_size*seq_len] + loss = loss.sum() / total_mask.sum() # calculating mean loss considering mask + return loss + + def __call__(self, logits_dict, shifted_tgt, mask, valid): + train_loss_list = [] + log_loss_dict_normal = {} + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask) + train_loss_list.append(training_loss) + if valid: + if key == 'type': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=None, conti_token=None) + elif key == 'beat': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999) + else: + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=None) + k_normal = key + '_normal' + log_loss_dict_normal[k_normal] = log_normal_loss + total_loss = sum(train_loss_list) / len(train_loss_list) + if valid: + return total_loss, log_loss_dict_normal + else: + return total_loss, None + +def dispersive_loss(z, tau=0.5, eps=1e-8): + """Dispersive Loss implemented with cosine distance.""" + B = z.size(0) + + # cosine-similarity matrix [B, B] + z_norm = torch.nn.functional.normalize(z, p=2, dim=1) # L2-normalize each vector + sim_matrix = torch.matmul(z_norm, z_norm.transpose(0, 1)) # cosine similarity + + # convert to cosine distance (1 - similarity) and exclude the diagonal + mask = 1 - torch.eye(B, device=z.device) + cos_dist = (1 - sim_matrix) * mask + + # dispersion loss (same form as the L2 version) + exp_term = torch.exp(-cos_dist / tau) + mean_exp = exp_term.sum() / (B * 
(B - 1) + eps) + loss = -torch.log(mean_exp + eps) + return loss +class DiffusionLoss4CompoundToken(): + def __init__(self, feature_list, focal_alpha:float, focal_gamma:float): + self.feature_list = feature_list + self.alpha = focal_alpha + self.gamma = focal_gamma + + def get_nll_loss(self, logits, target, mask,mask_indices, p_mask): + if logits.ndim == 3: + logits = logits.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + if mask_indices.ndim == 2: + mask_indices = mask_indices.flatten(0, 1) + if p_mask.ndim == 2: + p_mask = p_mask.flatten(0, 1) + if mask.ndim == 2: + mask = mask.flatten(0, 1) + # datatype of logits, target, mask_indices, p_mask should be the same + token_loss = F.cross_entropy( + logits[mask_indices], # 直接索引 logits + target[mask_indices], + reduction='none' + ) / p_mask[mask_indices] + loss = (token_loss * mask[mask_indices]).sum() / mask[mask_indices].sum() + return loss + + def get_nll_loss_for_logging(self, logits, target, mask, ignore_token, conti_token, mask_indices, p_mask): + if ignore_token is not None and conti_token is not None: + target_conti = add_conti_for_single_feature(target) # [batch_size*seq_len] + valid_mask = (target_conti != ignore_token) & (target_conti != conti_token) # [batch_size*seq_len] + elif ignore_token is not None and conti_token is None: + valid_mask = (target != ignore_token) + elif ignore_token is None and conti_token is None: + valid_mask = torch.ones_like(target).bool() + valid_mask = valid_mask.flatten(0, 1) + + if logits.ndim == 3: + logits = logits.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + if mask_indices.ndim == 2: + mask_indices = mask_indices.flatten(0, 1) + if p_mask.ndim == 2: + p_mask = p_mask.flatten(0, 1) + token_loss = F.cross_entropy( + logits[mask_indices], # 直接索引 logits + target[mask_indices], + reduction='none' + ) / p_mask[mask_indices] + total_mask = mask.flatten(0, 1) & valid_mask # [batch_size*seq_len] + loss = (token_loss * total_mask[mask_indices]).sum() / total_mask[mask_indices].sum() + + return loss + + def __call__(self, logits_dict, shifted_tgt, mask, mask_indices, p_mask, valid, input_dict=None,lambda_weight=0.5, tau=0.5): + train_loss_list = [] + log_loss_dict_normal = {} + mask_indices = mask_indices.reshape(shifted_tgt.shape[0], shifted_tgt.shape[1], -1) + p_mask = p_mask.reshape(shifted_tgt.shape[0], shifted_tgt.shape[1], -1) + disp_loss = None + if input_dict is not None: + hidden_vec =input_dict['hidden_vec'] #bs,seq_len,dim + feat = hidden_vec.mean(dim=1) #bs,dim + disp_loss = dispersive_loss(feat, tau=tau) # scalar + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask, mask_indices[..., idx], p_mask[..., idx]) + train_loss_list.append(training_loss) + if valid: + if key == 'type': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=None, conti_token=None, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + elif key == 'beat': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=9999, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, 
conti_token=9999, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + else: + log_normal_loss = self.get_nll_loss_for_logging(logits_dict[key], shifted_tgt[..., idx], mask, ignore_token=0, conti_token=None, mask_indices=mask_indices[..., idx], p_mask=p_mask[..., idx]) + k_normal = key + '_normal' + log_loss_dict_normal[k_normal] = log_normal_loss + total_loss = sum(train_loss_list) / len(train_loss_list) + if disp_loss is not None: + total_loss = total_loss + lambda_weight * disp_loss + log_loss_dict_normal['dispersion'] = disp_loss.item() + if valid: + return total_loss, log_loss_dict_normal + else: + return total_loss, None + +class EncodecFlattenLoss(): + def __init__(self, feature_list): + self.feature_list = feature_list + + def get_nll_loss(self, logits, target, mask): + probs = logits.softmax(dim=-1) + if probs.ndim == 3: + probs = probs.flatten(0, 1) # [batch_size*seq_len x vocab_size] + if target.ndim == 2: + target = target.flatten(0, 1) # [batch_size*seq_len] + pt = probs[torch.arange(len(target)), target].clamp(1e-7, 1-1e-7) # [batch_size*seq_len] + loss_seq = -torch.log(pt) # [batch_size*seq_len] + loss_seq = loss_seq * mask.flatten(0, 1) # [batch_size*seq_len] + loss = loss_seq.sum() / mask.sum() # calculating mean loss considering mask + return loss + + def __call__(self, logits, shifted_tgt, mask): + loss = self.get_nll_loss(logits, shifted_tgt, mask) + return loss + +class EncodecMultiClassLoss(EncodecFlattenLoss): + def __init__(self, feature_list): + super().__init__(feature_list) + + def __call__(self, logits_dict, shifted_tgt, mask): + train_loss_list = [] + for idx, key in enumerate(self.feature_list): + training_loss = self.get_nll_loss(logits_dict[key], shifted_tgt[..., idx], mask) + train_loss_list.append(training_loss) + total_loss = sum(train_loss_list) / len(train_loss_list) + return total_loss + +########################### Learning rate Scheduler ################################ +''' +This scheduler is from https://gaussian37.github.io/dl-pytorch-lr_scheduler/#custom-cosineannealingwarmrestarts-1 +It's basically a cosine annealing scheduler with warm restarts including two methods, warm up start and reducing maximum lr. 
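A rough usage sketch (the hyperparameter values below are illustrative only, not the project defaults, and train_one_step is a hypothetical helper standing in for the forward/backward pass):

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)  # base_lr is both the warm-up start and the value the cosine decays back to
    scheduler = CosineAnnealingWarmUpRestarts(optimizer, T_0=10000, T_mult=1, eta_max=3e-4, T_up=1000, gamma=0.5)
    for step in range(num_steps):
        train_one_step(batch)   # forward/backward + optimizer.step()
        scheduler.step()        # stepped once per iteration, as the trainers below do

With these settings the learning rate rises linearly from base_lr to eta_max over the first T_up steps of a cycle, follows a cosine curve back down to base_lr over the remaining T_0 - T_up steps, and at every restart the peak eta_max is scaled by gamma while the cycle length is stretched by roughly T_mult.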
+''' + +class CosineAnnealingWarmUpRestarts(_LRScheduler): + def __init__(self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1., last_epoch=-1, eta_min=0): + if T_0 <= 0 or not isinstance(T_0, int): + raise ValueError("Expected positive integer T_0, but got {}".format(T_0)) + if T_mult < 1 or not isinstance(T_mult, int): + raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult)) + if T_up < 0 or not isinstance(T_up, int): + raise ValueError("Expected positive integer T_up, but got {}".format(T_up)) + self.T_0 = T_0 + self.T_mult = T_mult + self.base_eta_max = eta_max + self.eta_max = eta_max + self.T_up = T_up + self.T_i = T_0 + self.gamma = gamma + self.cycle = 0 + self.T_cur = last_epoch + super(CosineAnnealingWarmUpRestarts, self).__init__(optimizer, last_epoch) + + def get_lr(self): + if self.T_cur == -1: + return self.base_lrs + elif self.T_cur < self.T_up: + return [(self.eta_max - base_lr)*self.T_cur / self.T_up + base_lr for base_lr in self.base_lrs] + else: + return [base_lr + (self.eta_max - base_lr) * (1 + math.cos(math.pi * (self.T_cur-self.T_up) / (self.T_i - self.T_up))) / 2 + for base_lr in self.base_lrs] + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + self.T_cur = self.T_cur + 1 + if self.T_cur >= self.T_i: + self.cycle += 1 + self.T_cur = self.T_cur - self.T_i + self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up + else: + if epoch >= self.T_0: + if self.T_mult == 1: + self.T_cur = epoch % self.T_0 + self.cycle = epoch // self.T_0 + else: + n = int(math.log((epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult)) + self.cycle = n + self.T_cur = epoch - self.T_0 * (self.T_mult ** n - 1) / (self.T_mult - 1) + self.T_i = self.T_0 * self.T_mult ** (n) + else: + self.T_i = self.T_0 + self.T_cur = epoch + + self.eta_max = self.base_eta_max * (self.gamma**self.cycle) + self.last_epoch = math.floor(epoch) + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr + +class CosineLRScheduler(_LRScheduler): + """Cosine LR scheduler. + Args: + optimizer (Optimizer): Torch optimizer. + warmup_steps (int): Number of warmup steps. + total_steps (int): Total number of steps. + lr_min_ratio (float): Minimum learning rate. + cycle_length (float): Cycle length. + """ + def __init__(self, optimizer: Optimizer, total_steps: int, warmup_steps: int, + lr_min_ratio: float = 0.0, cycle_length: float = 1.0): + self.warmup_steps = warmup_steps + assert self.warmup_steps >= 0 + self.total_steps = total_steps + assert self.total_steps >= 0 + self.lr_min_ratio = lr_min_ratio + self.cycle_length = cycle_length + super().__init__(optimizer) + + def _get_sched_lr(self, lr: float, step: int): + if step < self.warmup_steps: + lr_ratio = step / self.warmup_steps + lr = lr_ratio * lr + elif step <= self.total_steps: + s = (step - self.warmup_steps) / (self.total_steps - self.warmup_steps) + lr_ratio = self.lr_min_ratio + 0.5 * (1 - self.lr_min_ratio) * \ + (1. 
+ math.cos(math.pi * s / self.cycle_length)) + lr = lr_ratio * lr + else: + lr_ratio = self.lr_min_ratio + lr = lr_ratio * lr + return lr + + def get_lr(self): + return [self._get_sched_lr(lr, self.last_epoch) for lr in self.base_lrs] + + +class DispersiveLoss(nn.Module): + def __init__(self, loss_type='infonce_l2', tau=0.5, lambda_weight=0.5): + super().__init__() + self.loss_type = loss_type + self.tau = tau + self.lambda_weight = lambda_weight + + def forward(self, features, diffusion_loss): + """ + features: 批次特征矩阵,形状为 [batch_size, feature_dim] + diffusion_loss: 原扩散损失 + """ + batch_size = features.size(0) + + # 计算距离矩阵 + if self.loss_type == 'infonce_l2': + # 计算平方L2距离 + dist_matrix = torch.cdist(features, features, p=2) ** 2 + # 计算分散损失 + exp_dist = torch.exp(-dist_matrix / self.tau) + disp_loss = torch.log(exp_dist.mean()) + elif self.loss_type == 'hinge': + # Hinge损失,假设阈值epsilon=1.0 + dist_matrix = torch.cdist(features, features, p=2) + disp_loss = torch.max(torch.zeros_like(dist_matrix), 1.0 - dist_matrix).mean() + elif self.loss_type == 'covariance': + # 协方差损失 + normalized_features = (features - features.mean(dim=0)) / features.std(dim=0) + cov_matrix = torch.matmul(normalized_features.T, normalized_features) / batch_size + # 非对角线元素平方和 + mask = ~torch.eye(cov_matrix.size(0), dtype=torch.bool) + disp_loss = (cov_matrix[mask] ** 2).mean() + else: + raise ValueError("Unsupported loss type") + + # 总损失 = 扩散损失 + lambda * 分散损失 + total_loss = diffusion_loss + self.lambda_weight * disp_loss + return total_loss, disp_loss \ No newline at end of file diff --git a/Amadeus/trainer_accelerate.py b/Amadeus/trainer_accelerate.py new file mode 100644 index 0000000..3b7ccc2 --- /dev/null +++ b/Amadeus/trainer_accelerate.py @@ -0,0 +1,1012 @@ +from calendar import EPOCH, c +from multiprocessing import context +import time +import pickle +import os +from pathlib import Path +from typing import Union +from datetime import datetime +from omegaconf import OmegaConf +import random +import itertools + + +import torch +import torchaudio +from torch.utils.data import DataLoader +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler, Sampler + +# import accelerate +from accelerate import Accelerator, DistributedDataParallelKwargs +from accelerate.utils import set_seed +#====================================================================== + + +import wandb +from collections import defaultdict +from tqdm.auto import tqdm + +from .model_zoo import AmadeusModel +from .symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from .symbolic_encoding.data_utils import TuneCompiler +from .symbolic_encoding.decoding_utils import MidiDecoder4REMI +from .evaluation_utils import add_conti_in_valid +from .train_utils import NLLLoss4REMI + +from data_representation.vocab_utils import LangTokenVocab +class InfiniteSampler(Sampler): + def __init__(self, data_source, shuffle=True): + self.data_source = data_source + self.shuffle = shuffle + self.indices = list(range(len(data_source))) + if self.shuffle: + random.shuffle(self.indices) + self.infinite_iterator = itertools.cycle(self.indices) + + def __iter__(self): + return self.infinite_iterator + + def __len__(self): + return None # 表示无限长度 + +class LanguageModelTrainer: + def __init__( + self, + model: AmadeusModel, # The language model for music generation + optimizer: torch.optim.Optimizer, # Optimizer for updating model weights + scheduler: torch.optim.lr_scheduler._LRScheduler, # Learning 
rate scheduler + loss_fn: NLLLoss4REMI, # Loss function to compute the error + midi_decoder: MidiDecoder4REMI, # Decoder to convert model output into MIDI format + train_set: TuneCompiler, # Training dataset + valid_set: TuneCompiler, # Validation dataset + save_dir: str, # Directory to save models and logs + vocab: LangTokenVocab, # Vocabulary for tokenizing sequences + use_ddp: bool, # Whether to use Distributed Data Parallel (DDP) + use_fp16: bool, # Whether to use mixed-precision training (FP16) + world_size: int, # Total number of devices for distributed training + batch_size: int, # Batch size for training + infer_target_len: int, # Target length for inference generation + gpu_id: int, # GPU device ID for computation + sampling_method: str, # Sampling method for sequence generation + sampling_threshold: float, # Threshold for sampling decisions + sampling_temperature: float, # Temperature for controlling sampling randomness + config, # Configuration parameters (contains general, training, and inference settings) + model_checkpoint: Union[str, None] = None, # Path to a pre-trainmodl checkpoint (optional) + ): + # Save model, optimizer, and other configurations + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.loss_fn = loss_fn + + self.valid_set = valid_set + self.vocab = vocab + self.use_ddp = use_ddp + self.world_size = world_size + self.batch_size = batch_size + self.gpu_id = gpu_id + self.sampling_method = sampling_method + self.sampling_threshold = sampling_threshold + self.sampling_temperature = sampling_temperature + self.config = config + self.last_iter = 0 + + # Load pre-trained model if provided + if model_checkpoint: + # parse the model checkpoint iter + if isinstance(model_checkpoint, str): + if model_checkpoint.endswith('.pt'): + self.last_iter = int(model_checkpoint.split('/')[-1].split('_')[0][4:]) + checkpoint = torch.load(model_checkpoint, map_location='cpu') + # print state dict keys + print("Loading model checkpoint from", model_checkpoint) + print("Checkpoint keys:", checkpoint['model'].keys()) + if isinstance(self.model, DDP): + self.model.module.load_state_dict(checkpoint['model'], strict=False) + else: + + self.model.load_state_dict(checkpoint['model'], strict=False) + # Training hyperparameters from config + self.grad_clip = config.train_params.grad_clip + self.num_cycles_for_inference = config.train_params.num_cycles_for_inference + self.num_cycles_for_model_checkpoint = config.train_params.num_cycles_for_model_checkpoint + self.iterations_per_training_cycle = config.train_params.iterations_per_training_cycle + self.iterations_per_validation_cycle = config.train_params.iterations_per_validation_cycle + self.make_log = config.general.make_log + self.num_uncond_generation = config.inference_params.num_uncond_generation + self.num_cond_generation = config.inference_params.num_cond_generation + self.num_max_seq_len = infer_target_len + self.infer_and_log = config.general.infer_and_log + self.valid_loader = self.generate_data_loader(self.valid_set, shuffle=False, drop_last=True) + + # gradient accumulation + self.gradient_accumulation_steps = config.train_params.gradient_accumulation_steps + # Set up mixed-precision training (FP16) if enabled + if use_fp16: + self.use_fp16 = True + else: + self.use_fp16 = False + # Set up Distributed Data Parallel (DDP) if required + if use_ddp: + # prepare using accelerator + if self.use_fp16: + self.accelerator = Accelerator(mixed_precision='bf16', + step_scheduler_with_optimizer=False, + 
gradient_accumulation_steps=self.gradient_accumulation_steps, + kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)]) + else: + self.accelerator = Accelerator( + gradient_accumulation_steps=self.gradient_accumulation_steps, + step_scheduler_with_optimizer=False, + kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)]) + + with self.accelerator.main_process_first(): + self.train_set = train_set + self.train_loader = self.generate_data_loader(self.train_set, shuffle=False, drop_last=False) + self.accelerator.wait_for_everyone() + self.accelerator.print(f"Using {self.world_size} GPUs for training") + + self.model, self.optimizer, self.scheduler, self.train_loader = self.accelerator.prepare( + self.model, self.optimizer, self.scheduler, self.train_loader + ) + self.accelerator.wait_for_everyone() + # self.accelerator.init_trackers("nested_music_transformer", config) + set_seed(42) + self.device = self.accelerator.device + self.model.to(self.device) + # set up for logging + if self.accelerator.is_main_process: + save_dir = self.setup_log(config) + print("savwe",save_dir) + # Create directory for saving models and logs + self.save_dir = Path(save_dir) + self.save_dir.mkdir(exist_ok=True, parents=True) + self.set_save_out() + else: + self.train_set = train_set + # Create data loaders for training and validation sets + self.train_loader = self.generate_data_loader(train_set, shuffle=False, drop_last=True) + self.valid_loader = self.generate_data_loader(valid_set, shuffle=True, drop_last=True) + save_dir = self.setup_log(config) + # Create directory for saving models and logs + self.save_dir = Path(save_dir) + self.save_dir.mkdir(exist_ok=True, parents=True) + self.set_save_out() + + self.device = config.train_params.device + self.model.to(self.device) + + + # Initialize tracking metrics + self.best_valid_accuracy = 0 + self.best_valid_loss = 100 + self.training_loss = [] + self.validation_loss = [] + self.validation_acc = [] + + self.midi_decoder = midi_decoder + + + def generate_experiment_name(self, config): + # add base hyperparameters to the experiment name + dataset_name = config.dataset + encoding_name = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + input_embedder_name = config.nn_params.input_embedder_name + sub_decoder_name = config.nn_params.sub_decoder_name + batch_size = config.train_params.batch_size + num_layers = config.nn_params.main_decoder.num_layer + input_length = config.train_params.input_length + first_pred_feature = config.data_params.first_pred_feature + + # Add target hyperparameters to the experiment name + # dropout + main_dropout = config.nn_params.model_dropout + # learning rate + lr_decay_rate = config.train_params.decay_step_rate + + time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # Combine the information into a single string for the experiment name + # experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{input_embedder_name}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}\ + # _dropout{main_dropout}_lrdecay{lr_decay_rate}" + experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}" + return experiment_name + + def collate_fn(self, batch): + """ + Custom collate function to handle variable-length sequences in a batch. 
+ It pads sequences to the maximum length in the batch and returns a tuple of padded sequences and their lengths. + """ + # Unzip the batch into segments, masks, captions, and encoded captions + segments, masks, captions, encoded_captions = zip(*batch) + # print("collate_fn",len(segments),len(masks),len(captions),len(encoded_captions)) + # # Pad the segments and masks to the maximum length in the batch + # padded_segments = torch.nn.utils.rnn.pad_sequence(segments, batch_first=True) + # padded_masks = torch.nn.utils.rnn.pad_sequence(masks, batch_first=True) + # # Return padded segments and masks along with captions and encoded captions + segments = torch.stack(segments, dim=0) + masks = torch.stack(masks, dim=0) + print(captions) + print(encoded_captions) + # captions = torch.stack(captions, dim=0) + # encoded_captions = torch.stack(encoded_captions, dim=0) + return segments, masks, captions, encoded_captions + # return padded_segments, padded_masks, captions, encoded_captions + def setup_log(self, config): + if self.accelerator.is_main_process: + if config.general.make_log: + experiment_name =self.generate_experiment_name(config) + wandb.init( + project="Acce_Music_Transformer", + name=experiment_name, + config=OmegaConf.to_container(config) + ) + # 保存配置到 WANDB 根目录 + config_path = Path(wandb.run.dir) / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + save_dir = Path(wandb.run.dir) / "checkpoints" + save_dir.mkdir(exist_ok=True, parents=True) + else: + now = datetime.now() + save_dir = Path('wandb/debug/checkpoints') / now.strftime('%y-%m-%d') + save_dir.mkdir(exist_ok=True, parents=True) + # 保存配置到调试目录 + config_path = save_dir / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + return str(save_dir) + + # Set up the output directories for saving MIDI results during inference + def set_save_out(self): + if self.accelerator.is_main_process: + # copy from latest folder in wandb/debug/checkpoints + target_folder = 'wandb/debug/checkpoints' + latest_folder = sorted(Path(target_folder).iterdir(), key=os.path.getmtime)[-1] + # get files in the latest folder + files = [f for f in latest_folder.iterdir() if f.is_file()] + # copy files to the save_dir + for file in files: + # copy the file to the save_dir + target_file = self.save_dir / file.name + if not target_file.exists(): + os.system(f'cp {file} {target_file}') + if self.infer_and_log: + self.valid_out_dir = self.save_dir / 'valid_out' + os.makedirs(self.valid_out_dir, exist_ok=True) + + # Save the current model and optimizer state + def save_model(self, path): + if isinstance(self.model, DDP): + torch.save({'model': self.model.module.state_dict(), 'optim': self.optimizer.state_dict()}, path) + else: + torch.save({'model': self.model.state_dict(), 'optim': self.optimizer.state_dict()}, path) + + # Generate the data loader for either training or validation datasets + def generate_data_loader(self, dataset, shuffle=False, drop_last=False) -> DataLoader: + return DataLoader(dataset, shuffle=shuffle, batch_size=self.batch_size, drop_last=drop_last,collate_fn=None, pin_memory=True,num_workers=4, persistent_workers=True, prefetch_factor=2, worker_init_fn=None) + + # Training function based on a given number of iterations + def accelerate_train_by_num_iter(self, num_iters): + # generator = iter(self.train_loader) + pbar = tqdm(total=num_iters, desc='Training', unit='iteration', leave=False) + completed_steps = self.last_iter + # save init model + while completed_steps < num_iters: + total_loss = 0 + current_loss = 0 + for i, 
batch in enumerate(self.train_loader): + # gradient accumulation + + with self.accelerator.accumulate(self.model): + + # Start time for the training step,only for main process + start_time = time.time() + + # Tra\in the model on a single batch + # loss_value, loss_dict = self._accelerate_train_by_single_batch(batch) + loss, _, loss_dict = self._get_loss_pred_from_single_batch(batch) + total_loss += loss.detach().float() + current_loss = loss.detach().float() + # loss.backward() + self.accelerator.backward(loss) + if self.accelerator.sync_gradients: + self.accelerator.unscale_gradients(self.optimizer) + if self.accelerator.sync_gradients: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + # self.accelerator.clip_grad_norm_(self.model.parameters(), self.grad_clip) + if not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and self.scheduler is not None: + self.scheduler.step() + + self.optimizer.step() + self.optimizer.zero_grad() + + + if self.accelerator.sync_gradients: + # update progress bar + loss_value = loss.item() + # log in main process + completed_steps += 1 + + # if self.accelerator.is_main_process: + loss_dict['time'] = time.time() - start_time + loss_dict['lr'] = self.optimizer.param_groups[0]['lr'] + loss_dict = self._rename_dict(loss_dict, 'train') + self.training_loss.append(loss_value) + if self.accelerator.is_main_process: + pbar.update(1) + pbar.set_postfix(loss=loss_value, lr=self.optimizer.param_groups[0]['lr']) + # save iter1 checkpoint + if completed_steps == 1 and self.accelerator.is_main_process: + self.save_model(self.save_dir / f'iter{completed_steps}_loss{current_loss:.4f}.pt') + + # Log training loss at the specified training cycle + if (completed_steps + 1) % self.iterations_per_training_cycle == 0 and self.make_log and self.accelerator.is_main_process: + wandb.log(loss_dict, step=completed_steps) + + # Log training accuracy periodically + if (completed_steps + 1) % (self.iterations_per_training_cycle * 3) == 0 and self.make_log: + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch, train=True) + train_metric_dict = self._get_train_accuracy(num_nonmask_tokens, num_tokens_by_feature, correct_guess_by_feature) + train_metric_dict.update(loss_dict) + train_metric_dict = self._rename_dict(train_metric_dict, 'train') + if self.accelerator.is_main_process: + wandb.log(train_metric_dict, step=completed_steps) + # delete variables to avoid memory leakages + del validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature, train_metric_dict + + # Perform validation at the specified interval + if (completed_steps + 1) % self.iterations_per_validation_cycle == 0: + self.model.eval() + validation_loss, validation_acc, validation_metric_dict = self.validate() + validation_metric_dict['acc'] = validation_acc + validation_metric_dict = self._rename_dict(validation_metric_dict, 'valid') + if self.make_log and self.accelerator.is_main_process: + wandb.log(validation_metric_dict, step=completed_steps) + self.validation_loss.append(validation_loss) + self.validation_acc.append(validation_acc) + self.best_valid_loss = min(validation_loss, self.best_valid_loss) + + # Perform inference and logging after a certain number of cycles + if (completed_steps + 1) % (self.num_cycles_for_inference * self.iterations_per_validation_cycle) == 0 and self.infer_and_log and self.accelerator.is_main_process: + 
self.inference_and_log(i, self.num_uncond_generation, self.num_cond_generation, self.num_max_seq_len) + + # Save a model checkpoint periodically + if (completed_steps + 1) % (self.iterations_per_validation_cycle * self.num_cycles_for_model_checkpoint) == 0 and self.accelerator.is_main_process: + self.accelerator.print(f"Saving model checkpoint at iter {completed_steps}") + self.save_model(self.save_dir / f'iter{completed_steps}_loss{validation_loss:.4f}.pt') + self.model.train() + + # delete variables to avoid memory leakages + del validation_acc, validation_metric_dict + # else: + # self.accelerator.wait_for_everyone() + # Save the model checkpoint at the end of each epoch + if self.accelerator.is_main_process: + print(f"Saving model checkpoint at iter {completed_steps}") + # Save the model state + self.save_model(self.save_dir / f"iter{completed_steps}_loss{current_loss:.4f}.pt") + # Save the final model after training + self.accelerator.wait_for_everyone() + if self.accelerator.is_main_process: + print("saving last checkpoint") + self.save_model(self.save_dir / f'checkpoint_last.pt') + + # same as above but for accelerate + def _accelarate_get_loss_pred_from_single_batch(self, batch): + """ + Computes the loss and predictions for a single batch of data. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + + Returns: + loss: The computed loss for the batch. + logits: The raw model predictions (logits). + loss_dict: A dictionary containing the total loss. + + The method: + - Separates the input sequences and target sequences from the batch. + - Moves the data to the appropriate device. + - Applies mixed precision (FP16) if applicable. + - Computes the logits using the model and calculates the loss using the specified loss function. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with torch.cuda.amp.autocast(): + logits = self.model(input_seq, target) + loss = self.loss_fn(logits, target, mask) + else: + logits = self.model(input_seq, None) + loss = self.loss_fn(logits, target, mask) + loss_dict = {'total': loss.item()} + return loss, logits, loss_dict + + + + def _train_by_single_batch(self, batch): + """ + Trains the model on a single batch of data. + + Args: + batch: A batch of data, typically consisting of input sequences and corresponding targets. + + Returns: + loss.item(): The total loss for this batch. + loss_dict: A dictionary containing information about the loss and other relevant metrics. + + The method: + - Calls `_get_loss_pred_from_single_batch` to compute the loss and predictions. + - Resets the optimizer's gradients. + - Depending on whether mixed precision (FP16) is used, it scales the loss and applies gradient clipping before stepping the optimizer. + - Updates the learning rate scheduler if applicable. + - Records the time taken for the training step and the current learning rate in the `loss_dict`. 
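        Note: the FP16 branch assumes a torch.cuda.amp.GradScaler is available as `self.scaler`; it is not
        created in the `__init__` shown above, which relies on Accelerate for mixed precision instead.
        The pattern used is the standard one:

            self.scaler.scale(loss).backward()
            self.scaler.unscale_(self.optimizer)   # so clip_grad_norm_ sees unscaled gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip)
            self.scaler.step(self.optimizer)
            self.scaler.update()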
+ """ + start_time = time.time() + loss, _, loss_dict = self._get_loss_pred_from_single_batch(batch) + self.optimizer.zero_grad() + if self.use_fp16: + self.scaler.scale(loss).backward() + self.scaler.unscale_(self.optimizer) + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.grad_clip) + self.optimizer.step() + if not isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and self.scheduler is not None: + self.scheduler.step() + loss_dict['time'] = time.time() - start_time + loss_dict['lr'] = self.optimizer.param_groups[0]['lr'] + return loss.item(), loss_dict + + def _get_loss_pred_from_single_batch(self, batch): + """ + Computes the loss and predictions for a single batch of data. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + + Returns: + loss: The computed loss for the batch. + logits: The raw model predictions (logits). + loss_dict: A dictionary containing the total loss. + + The method: + - Separates the input sequences and target sequences from the batch. + - Moves the data to the appropriate device. + - Applies mixed precision (FP16) if applicable. + - Computes the logits using the model and calculates the loss using the specified loss function. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with self.accelerator.autocast(): + logits = self.model(input_seq, target) + loss = self.loss_fn(logits, target, mask) + else: + logits = self.model(input_seq, None) + loss = self.loss_fn(logits, target, mask) + loss_dict = {'total': loss.item()} + return loss, logits, loss_dict + + def _get_valid_loss_and_acc_from_batch(self, batch, train=False): + """ + Computes validation loss and accuracy from a single batch. + + Args: + batch: A batch of data, typically containing input sequences, targets, and masks. + train (bool): Indicator whether the function is being used in training mode. + + Returns: + validation_loss: Total validation loss for the batch. + num_tokens: The number of valid tokens in the batch. + loss_dict: A dictionary containing the loss and relevant metrics. + None: Placeholder for future implementation. + num_correct_guess: Number of correctly predicted tokens. + + The method: + - Calls `_get_loss_pred_from_single_batch` to compute the loss and predictions. + - Computes token-level accuracy by comparing predicted tokens with the targets. + """ + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + loss, logits, loss_dict = self._get_loss_pred_from_single_batch(batch) + prob = torch.softmax(logits, dim=-1) + num_tokens = torch.sum(mask) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + + selected_tokens = torch.argmax(prob, dim=-1) * mask + shifted_tgt_with_mask = target * mask + num_correct_guess = torch.sum(selected_tokens == shifted_tgt_with_mask) - torch.sum(mask == 0) + + validation_loss = loss.item() * num_tokens + num_correct_guess = num_correct_guess.item() + return validation_loss, num_tokens, loss_dict, None, num_correct_guess + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess): + """ + Computes training accuracy. 
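        In this base trainer it is simply num_correct_guess / num_tokens; the REMI and
        compound-token subclasses override it to also report per-feature accuracies.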
+ + Args: + num_tokens: Total number of tokens processed. + num_tokens_by_feature: Number of tokens for each feature (not used here). + num_correct_guess: Number of correctly predicted tokens. + + Returns: + Training accuracy, computed as the ratio of correct predictions to the total number of tokens. + """ + return num_correct_guess / num_tokens + + def validate(self, external_loader=None): + """ + Validates the model on a dataset. + + Args: + external_loader (DataLoader): If provided, an external DataLoader can be used for validation. + + Returns: + total_validation_loss: Average validation loss over all batches. + total_num_correct_guess: Total number of correct predictions divided by the number of tokens (accuracy). + validation_metric_dict: Dictionary of validation metrics averaged over all batches. + + The method: + - Iterates through the validation data loader, calculating the loss and accuracy for each batch. + - Aggregates the results over all batches and returns the overall validation metrics. + """ + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_correct_guess = 0 + total_num_tokens = 0 + validation_metric_dict = defaultdict(float) + with torch.inference_mode(): + for batch in tqdm(loader, leave=False): + validation_loss, num_tokens, loss_dict, _, num_correct_guess = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_tokens + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + validation_metric_dict[key] += value * num_tokens + for key in validation_metric_dict.keys(): + validation_metric_dict[key] /= total_num_tokens + + return total_validation_loss / total_num_tokens, total_num_correct_guess / total_num_tokens, validation_metric_dict + + def _make_midi_from_generated_output(self, generated_output, iter, seed, condition=None): + """ + Generates a MIDI file and logs output from the generated sequence. + + Args: + generated_output: The sequence of notes generated by the model. + iter: The current iteration of the training process. + seed: The seed used for generating the sequence. + condition: Optional condition input for generating conditional output. + + The method: + - Converts the generated output into a MIDI file and logs it. + - Optionally logs additional error metrics and figures for analysis. 
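        Outputs are written under `self.valid_out_dir`, with a "cond_" prefix when a condition is given:

            {prefix}generated_output_{iter}_seed_{seed}.pkl  (raw generated token sequence)
            {prefix}midi_decoded_{iter}_seed_{seed}.mid      (decoded MIDI file)

        The wandb log then references a .png (score image) and .mp3 (rendered audio) with the same stem.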
+ """ + if condition is not None: + path_addition = "cond_" + else: + path_addition = "" + with open(self.valid_out_dir / f"{path_addition}generated_output_{iter}_seed_{seed}.pkl", 'wb') as f: + pickle.dump(generated_output, f) + self.midi_decoder(generated_output, self.valid_out_dir / f"{path_addition}midi_decoded_{iter}_seed_{seed}.mid") + if self.make_log: + log_dict = {} + log_dict[f'{path_addition}gen_score'] = wandb.Image(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.png')) + log_dict[f'{path_addition}gen_audio'] = wandb.Audio(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.mp3')) + wandb.log(log_dict, step=(iter+seed)) + print(f"{path_addition}inference is logged: Iter {iter} / seed {seed}") + return generated_output + + @torch.inference_mode() + def inference_and_log(self, iter, num_uncond_generation=5, num_cond_generation=5, max_seq_len=10000): + """ + Generates and logs both unconditional and conditional output sequences. + + Args: + iter: The current iteration. + num_uncond_generation: Number of unconditional sequences to generate. + num_cond_generation: Number of conditional sequences to generate. + max_seq_len: Maximum sequence length to generate. + + The method: + - Generates unconditional and conditional sequences using the model's generation function. + - Converts the sequences into MIDI files and logs the generated results. + """ + self.model.eval() + for i in range(num_uncond_generation): + try: + start_time = time.time() + uncond_generated_output = self.model.module.generate(manual_seed=i, max_seq_len=max_seq_len, condition=None, \ + sampling_method=self.sampling_method, threshold=self.sampling_threshold, temperature=self.sampling_temperature) + if len(uncond_generated_output) == 0: continue + print(f"unconditional generation time_{iter}: {time.time() - start_time:.4f}") + print(f"unconditional length of generated_output: {uncond_generated_output.shape[1]}") + self._make_midi_from_generated_output(uncond_generated_output, iter, i, None) + except Exception as e: + print(e) + condition_list = [x[1] for x in self.valid_set.data_list[:num_cond_generation] ] + for i in range(num_cond_generation): + condition = self.valid_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + try: + start_time = time.time() + generated_output = self.model.module.generate(manual_seed=i, max_seq_len=max_seq_len, condition=condition, \ + sampling_method=self.sampling_method, threshold=self.sampling_threshold, temperature=self.sampling_temperature) + if len(generated_output) == 0: continue + print(f"conditional generation time_{iter}: {time.time() - start_time:.4f}") + print(f"conditional length of generated_output: {generated_output.shape[1]}") + self._make_midi_from_generated_output(generated_output, iter+num_uncond_generation, i, condition) + except Exception as e: + print(e) + + def _rename_dict(self, adict, prefix='train'): + ''' + Renames the keys in a dictionary by adding a prefix. 
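        For example, with prefix='train', {'total': 0.91, 'time': 0.37} becomes {'train.total': 0.91, 'train.time': 0.37}.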
+ ''' + keys = list(adict.keys()) + for key in keys: + adict[f'{prefix}.{key}'] = adict.pop(key) + return dict(adict) + +class LanguageModelTrainer4REMI(LanguageModelTrainer): + def __init__(self, model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config): + super().__init__(model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config) + + def _get_loss_pred_from_single_batch(self, batch, valid=False): + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + if self.use_fp16: + with self.accelerator.autocast(): + logits = self.model(input_seq, target) + if not valid: + total_loss, loss_dict = self.loss_fn(logits, target, mask, None) + return total_loss, logits, {'total':total_loss.item()} + else: + total_loss, loss_dict = self.loss_fn(logits, target, mask, self.vocab) + loss_dict['total'] = total_loss.item() + return total_loss, logits, loss_dict + else: + logits = self.model(input_seq, target) + if not valid: + total_loss, loss_dict = self.loss_fn(logits, target, mask, None) + return total_loss, logits, {'total':total_loss.item()} + else: + total_loss, loss_dict = self.loss_fn(logits, target, mask, self.vocab) + loss_dict['total'] = total_loss.item() + return total_loss, logits, loss_dict + + def _get_valid_loss_and_acc_from_batch(self, batch, train=False): + segment, mask, caption,encoded_caption = batch + mask = mask[:, :-1] + _, target = segment[:, :-1], segment[:, 1:] + loss, logits, loss_dict = self._get_loss_pred_from_single_batch(batch, valid=True) + prob = torch.softmax(logits, dim=-1) + num_nonmask_tokens = torch.sum(mask) # [b, t] + target = target.to(self.device) # [b, t] + mask = mask.to(self.device) + + prob_with_mask = torch.argmax(prob, dim=-1) * mask # [b, t] + shifted_tgt_with_mask = target * mask # [b, t] + + correct_guess_by_feature = defaultdict(int) + num_tokens_by_feature = defaultdict(int) + tokens_idx = prob_with_mask.flatten(0,1) # [b*t] + answers_idx = shifted_tgt_with_mask.flatten(0,1) # [b*t] + if self.vocab.encoding_scheme == 'remi': + eos_idx = 2 + for feature in self.vocab.feature_list: + feature_mask = self.vocab.total_mask[feature].to(self.device) # [327,] + mask_for_target = feature_mask[answers_idx] # [b*t] + if feature == 'type': # because Bar token is 0, we need to add 1 to calculate accuracy + valid_pred = (tokens_idx+1) * mask_for_target + valid_answers = (answers_idx+1) * mask_for_target + eos_mask = valid_answers != eos_idx # because EOS is also working as a padding + correct_guess_by_feature[feature] += torch.sum(valid_pred[eos_mask] == valid_answers[eos_mask]).item() - torch.sum(mask_for_target[eos_mask] == 0).item() + num_tokens_by_feature[feature] += torch.sum(mask_for_target[eos_mask]).item() + else: + valid_pred = tokens_idx * mask_for_target # [b, t] + valid_answers = answers_idx * mask_for_target # [b, t] + correct_guess_by_feature[feature] += torch.sum(valid_pred == valid_answers).item() - torch.sum(mask_for_target == 0).item() + num_tokens_by_feature[feature] += torch.sum(mask_for_target).item() + validation_loss = loss.item() * num_nonmask_tokens.item() 
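        # Note: the batch loss is rescaled by the number of non-masked tokens here so that
        # validate() can simply sum it over batches and divide by the total token count,
        # i.e. the reported validation loss is token-weighted rather than batch-weighted.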
+ return validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess_by_feature): + total_num_correct_guess = 0 + total_num_tokens = 0 + acc_dict = {} + for feature, num_correct_guess in num_correct_guess_by_feature.items(): + if feature == 'type': + continue + total_num_correct_guess += num_correct_guess + total_num_tokens += num_tokens_by_feature[feature] + if num_tokens_by_feature[feature] == 0: + continue + acc_dict[f"{feature}_acc"] = num_correct_guess / num_tokens_by_feature[feature] + total_accuracy = total_num_correct_guess / total_num_tokens + acc_dict['total_acc'] = total_accuracy + return acc_dict + + def validate(self, external_loader=None): + ''' + total_num_tokens: for calculating loss, nonmask tokens + total_num_valid_tokens: for calculating accuracy, valid tokens + ''' + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_tokens = 0 + total_num_valid_tokens = 0 + total_num_correct_guess = 0 + validation_metric_dict = defaultdict(float) + total_num_tokens_by_feature = defaultdict(int) + total_num_correct_guess_dict = defaultdict(int) + with torch.inference_mode(): + for num_iter, batch in enumerate(tqdm(loader, leave=False)): + if num_iter == len(self.valid_loader): + if loader is not self.valid_loader: # when validate with train_loader + break + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, num_correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_nonmask_tokens.item() + for key, num_tokens in num_tokens_by_feature.items(): + total_num_tokens_by_feature[key] += num_tokens + if key == 'type': + continue + total_num_valid_tokens += num_tokens # num tokens are all the same for each musical type, torch.sum(mask) + for key, num_correct_guess in num_correct_guess_by_feature.items(): + total_num_correct_guess_dict[key] += num_correct_guess + if key == 'type': + continue + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + if key == 'total': + validation_metric_dict[key] += value * num_nonmask_tokens + else: + feature_name = key.split('_')[0] + validation_metric_dict[key] += value * num_tokens_by_feature[feature_name] + + for key in validation_metric_dict.keys(): + if key == 'total': + validation_metric_dict[key] /= total_num_tokens + else: + feature_name = key.split('_')[0] + if total_num_tokens_by_feature[feature_name] == 0: + continue + validation_metric_dict[key] /= total_num_tokens_by_feature[feature_name] + + for key in total_num_tokens_by_feature.keys(): + num_tokens = total_num_tokens_by_feature[key] + num_correct = total_num_correct_guess_dict[key] + if num_tokens == 0: + continue + validation_metric_dict[f'{key}_acc'] = num_correct / num_tokens + return total_validation_loss / total_num_tokens, total_num_correct_guess / total_num_valid_tokens, validation_metric_dict + +class LanguageModelTrainer4CompoundToken(LanguageModelTrainer): + def __init__(self, model, optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config): + super().__init__(model, 
optimizer, scheduler, loss_fn, midi_decoder, train_set, valid_set, save_dir, vocab, use_ddp, use_fp16, world_size, batch_size, infer_target_len, gpu_id, sampling_method, sampling_threshold, sampling_temperature, config) + + ''' + About ignore_token and conti_token: + During validation, tokens with this "conti" value are ignored when calculating accuracy or other metrics, + ensuring that repeated values don't unfairly skew the results. + This is especially relevant for features like beat, chord, tempo, and instrument where repeated tokens may have a specific musical meaning. + + We used ignore_token and conti_token to fairly compare compound token based encoding with REMI encoding. + ''' + + def _get_num_valid_and_correct_tokens(self, prob, ground_truth, mask, ignore_token=None, conti_token=None): + valid_prob = torch.argmax(prob, dim=-1) * mask + valid_ground_truth = ground_truth * mask + + if ignore_token is None and conti_token is None: + num_valid_tokens = torch.sum(mask) + num_correct_tokens = torch.sum(valid_prob == valid_ground_truth) - torch.sum(mask == 0) + elif ignore_token is not None and conti_token is None: + ignore_mask = valid_ground_truth != ignore_token # batch x seq_len + num_valid_tokens = torch.sum(ignore_mask) + num_correct_tokens = torch.sum(valid_prob[ignore_mask] == valid_ground_truth[ignore_mask]) # by using mask, the tensor becomes 1d + elif ignore_token is not None and conti_token is not None: + ignore_conti_mask = (valid_ground_truth != ignore_token) & (valid_ground_truth != conti_token) + num_valid_tokens = torch.sum(ignore_conti_mask) + num_correct_tokens = torch.sum(valid_prob[ignore_conti_mask] == valid_ground_truth[ignore_conti_mask]) + return num_correct_tokens.item(), num_valid_tokens.item() + + def _get_loss_pred_from_single_batch(self, batch, valid=False): + # print(batch) + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + input_seq = input_seq.to(self.device) + target = target.to(self.device) + mask = mask[:, :-1].to(self.device) + encoded_caption = encoded_caption.to(self.device) + if self.use_fp16: + if self.config.use_diff is True: + with self.accelerator.autocast(): + # breakpoint() + (logits_dict, (masked_indices, p_mask)),input_dict = self.model(input_seq, target,context=encoded_caption) + if self.config.use_dispLoss == True: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid, input_dict=input_dict,lambda_weight=self.config.lambda_weight,tau=self.config.tau) + else: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid) + else: + with self.accelerator.autocast(): + logits_dict,_ = self.model(input_seq, target,context=encoded_caption) + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, valid) + else: + if self.config.use_diff is True: + # breakpoint() + if self.config.use_dispLoss == True: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid, input_dict=input_dict,lambda_weight=self.config.lambda_weight) + else: + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, masked_indices, p_mask, valid) + else: + logits_dict, input_Dict = self.model(input_seq, target,context=encoded_caption) + total_loss, loss_dict = self.loss_fn(logits_dict, target, mask, valid) + if valid: + loss_dict['total'] = total_loss.item() + else: + loss_dict = {'total':total_loss.item()} + return total_loss, logits_dict, loss_dict + + def 
_get_valid_loss_and_acc_from_batch(self, batch, train=False): + ''' + in this method, valid means handled with both ignore token and mask + when valid tokens with only mask, it is called num_nonmask_tokens + + input_seq, target: batch x seq_len x num_features + mask: batch x seq_len, 0 for padding + prob: batch x seq_len x total_vocab_size + ''' + segment, mask, caption,encoded_caption = batch + input_seq, target = segment[:, :-1], segment[:, 1:] + total_loss, logits_dict, loss_dict = self._get_loss_pred_from_single_batch(batch, valid=True) + probs_dict = {key:torch.softmax(value, dim=-1) for key, value in logits_dict.items()} + num_nonmask_tokens = torch.sum(mask) + input_seq = input_seq.to(self.device) + target = add_conti_in_valid(target, self.config.nn_params.encoding_scheme).to(self.device) + mask = mask[:, :-1].to(self.device) + + correct_guess_by_feature = defaultdict(int) + num_tokens_by_feature = defaultdict(int) + for idx, key in enumerate(self.vocab.feature_list): + if key == 'type': + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=None, conti_token=None) + elif key == 'chord' or key == 'tempo' or key == 'instrument': + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=9999) + elif key == 'beat': + # NB's beat vocab has Ignore and CONTI token + # CP's beat vocab has Ignore and BAR token, we exclude BAR token in accuracy calculation for parity with NB + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=9999) + else: + num_correct_tokens, num_valid_tokens = self._get_num_valid_and_correct_tokens(probs_dict[key], target[..., idx], mask, ignore_token=0, conti_token=None) + correct_guess_by_feature[key] = num_correct_tokens + num_tokens_by_feature[key] = num_valid_tokens + validation_loss = total_loss.item() * num_nonmask_tokens.item() + return validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, correct_guess_by_feature + + def _get_train_accuracy(self, num_tokens, num_tokens_by_feature, num_correct_guess_by_feature): + total_num_correct_guess = 0 + total_num_tokens = 0 + acc_dict = {} + for feature, num_correct_guess in num_correct_guess_by_feature.items(): + if feature == 'type': + continue + total_num_correct_guess += num_correct_guess + total_num_tokens += num_tokens_by_feature[feature] + acc_dict[f"{feature}_acc"] = num_correct_guess / num_tokens_by_feature[feature] + total_accuracy = total_num_correct_guess / total_num_tokens + acc_dict['total_acc'] = total_accuracy + return acc_dict + + def validate(self, external_loader=None): + if external_loader and isinstance(external_loader, DataLoader): + loader = external_loader + print('An arbitrary loader is used instead of Validation loader') + else: + loader = self.valid_loader + + self.model.eval() + total_validation_loss = 0 + total_num_correct_guess = 0 + total_num_tokens = 0 + total_num_valid_tokens = 0 + validation_metric_dict = defaultdict(float) + total_num_tokens_by_feature = defaultdict(int) + total_num_correct_guess_dict = defaultdict(int) + + with torch.inference_mode(): + ''' + mask is used to calculate loss, accuracy + validation_loss: sum of loss for valid tokens conditioned on mask + num_nonmask_tokens: sum of tokens conditioned on mask + num_tokens_by_feature: sum of valid tokens(handle ignore) for each musical features + 
num_correct_guess_by_feature: sum of correct tokens(handle ignore) for each musical features + ''' + for num_iter, batch in tqdm(enumerate(loader), leave=False): + if num_iter == len(self.valid_loader): + if loader is not self.valid_loader: # when validate with train_loader + break + validation_loss, num_nonmask_tokens, loss_dict, num_tokens_by_feature, num_correct_guess_by_feature = self._get_valid_loss_and_acc_from_batch(batch) + total_validation_loss += validation_loss + total_num_tokens += num_nonmask_tokens + for key, num_tokens in num_tokens_by_feature.items(): + total_num_tokens_by_feature[key] += num_tokens + if key == 'type': # because cp and nb have different number of type tokens, we don't want to calculate accuracy for type token + continue + total_num_valid_tokens += num_tokens # num tokens are all the same for each musical type, torch.sum(mask) + for key, num_correct_guess in num_correct_guess_by_feature.items(): + total_num_correct_guess_dict[key] += num_correct_guess + if key == 'type': + continue + total_num_correct_guess += num_correct_guess + for key, value in loss_dict.items(): + if key == 'total': + validation_metric_dict[key] += value * num_nonmask_tokens + else: + # if torch.isnan(value): # in case num valid tokens is 0 because of mask + # continue + feature_name = key.split('_')[0] + validation_metric_dict[key] += value * num_tokens_by_feature[feature_name] + + for key in validation_metric_dict.keys(): + if key == 'total': + validation_metric_dict[key] /= total_num_tokens + else: + feature_name = key.split('_')[0] + if total_num_tokens_by_feature[feature_name] == 0: + continue + validation_metric_dict[key] /= total_num_tokens_by_feature[feature_name] + for (key_t, num_tokens), (key_c, num_correct) in zip(total_num_tokens_by_feature.items(), total_num_correct_guess_dict.items()): + validation_metric_dict[f'{key_c}_acc'] = num_correct / num_tokens + + return total_validation_loss / (total_num_tokens + 1), total_num_correct_guess / (1+total_num_valid_tokens), validation_metric_dict + + def _make_midi_from_generated_output(self, generated_output, iter, seed, condition=None): + if self.config.data_params.first_pred_feature != 'type' and self.config.nn_params.encoding_scheme == 'nb': + generated_output = reverse_shift_and_pad_for_tensor(generated_output, self.config.data_params.first_pred_feature) + if condition is not None: + path_addition = "cond_" + else: + path_addition = "" + + # save generated_output as pickle + with open(self.valid_out_dir / f"{path_addition}generated_output_{iter}_seed_{seed}.pkl", 'wb') as f: + pickle.dump(generated_output, f) + self.midi_decoder(generated_output, self.valid_out_dir / f"{path_addition}midi_decoded_{iter}_seed_{seed}.mid") + if self.make_log and self.infer_and_log: + log_dict = {} + log_dict[f'{path_addition}gen_score'] = wandb.Image(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.png')) + log_dict[f'{path_addition}gen_audio'] = wandb.Audio(str(self.valid_out_dir / f'{path_addition}midi_decoded_{iter}_seed_{seed}.mp3')) + wandb.log(log_dict, step=(iter+seed)) + print(f"{path_addition}inference is logged: Iter {iter} / seed {seed}") + diff --git a/Amadeus/transformer_utils.py b/Amadeus/transformer_utils.py new file mode 100644 index 0000000..3982341 --- /dev/null +++ b/Amadeus/transformer_utils.py @@ -0,0 +1,949 @@ +import torch +import torch.nn as nn + +from x_transformers import Decoder, Encoder, PrefixDecoder, CrossAttender +from transformers import T5EncoderModel +from data_representation.vocab_utils 
import LangTokenVocab + +class PosEncoding(nn.Module): + def __init__(self, emb_size, max_t): + super().__init__() + self.emb_size =emb_size + self.max_t = max_t + self.register_buffer('encoding', self._prepare_emb()) + + def _prepare_emb(self): + dim_axis = 10000**(torch.arange(self.emb_size//2) * 2 / self.emb_size) # 10000 ** (normalized values between 0~1 num_emb_dim) + timesteps = torch.arange(self.max_t) + pos_enc_in = timesteps.unsqueeze(1) / dim_axis.unsqueeze(0) + pos_enc_sin = torch.sin(pos_enc_in) # x values for sin are between 0 ~ 1 so the values could never be the same + pos_enc_cos = torch.cos(pos_enc_in) + + pos_enc = torch.stack([pos_enc_sin, pos_enc_cos], dim=-1).reshape([self.max_t, self.emb_size]) + return pos_enc + + def forward(self, x): + return self.encoding[x] + +class ResidualLayerNormModule(nn.Module): + def __init__(self, submodule): + super().__init__() + self.submodule = submodule + self.layer_norm = nn.LayerNorm(self.submodule.input_size) + + def forward(self, x, mask=None, y=None): + if y is not None: + res_x = self.submodule(x, y, mask) + elif mask is not None: + res_x = self.submodule(x, mask) + else: + res_x = self.submodule(x) + x = x + res_x + return self.layer_norm(x) + +class SingleEmbedding(nn.Module): + def __init__( + self, + vocab, + dim_model, + ): + ''' + Embedding layer for REMI + ''' + super().__init__() + vocab_size = vocab.get_vocab_size() + self.embedding = nn.Embedding(vocab_size, dim_model) + + def forward(self, x): + return self.embedding(x) + +class MultiEmbedding(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int, + ): + super().__init__() + ''' + Embedding layer for compound tokens + ''' + self.vocab_size = vocab.get_vocab_size() + self.feature_list = vocab.feature_list + self.dim_model = dim_model + self.layers = [] + + self._make_emb_layers() + self._init_params() + self._make_emb_boundaries_by_key() + + def _init_params(self): + # apply kaiming init + for layer in self.layers: + if isinstance(layer, nn.Embedding): + nn.init.kaiming_normal_(layer.weight) + + def _make_emb_layers(self): + vocab_sizes = [self.vocab_size[key] for key in self.feature_list] + self.embedding_sizes = [self.dim_model for _ in self.feature_list] + for vocab_size, embedding_size in zip(vocab_sizes, self.embedding_sizes): + if embedding_size != 0: + self.layers.append(nn.Embedding(vocab_size, embedding_size)) + self.layers = nn.ModuleList(self.layers) + + def _make_emb_boundaries_by_key(self): + ''' + This function returns dict of boundaries for each embedding layer + ''' + self.emb_boundary_by_key = {} + start_idx = 0 + for key, emb_size in zip(self.feature_list, self.embedding_sizes): + if emb_size != 0: + self.emb_boundary_by_key[key] = (start_idx, start_idx + emb_size) + start_idx += emb_size + + def forward(self, x): + emb = torch.cat([module(x[..., i]) for i, module in enumerate(self.layers)], dim=-1) + return emb + + def __len__(self): + return len(self.layers) + + def get_emb_by_key(self, key, token): + layer_idx = self.feature_list.index(key) + return self.layers[layer_idx](token) + +class SummationEmbedder(MultiEmbedding): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + + def forward(self, seq): + emb_list = [module(seq[..., i]) for i, module in enumerate(self.layers)] + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + output = torch.sum(stacked_emb, dim=2) # B x T x emb_size + return output + +class AverageEmbedder(MultiEmbedding): + def 
__init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + + def forward(self, seq): + emb_list = [module(seq[..., i]) for i, module in enumerate(self.layers)] + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + output = torch.mean(stacked_emb, dim=2) # B x T x emb_size + return output + +class SelfAttentionEmbedder(MultiEmbedding): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__(vocab, dim_model) + self.dropout = 0.1 + + self.transformer_encoder = Encoder( + dim = dim_model, + depth = 1, + heads = 8, + attn_dropout = self.dropout, + ff_dropout = self.dropout, + attn_flash = True) + + self.cls_embedding = nn.Parameter(torch.zeros(1, 1, self.dim_model), requires_grad=True) + + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff() + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn() + + def _add_dropout_after_attn(self): + for layer in self.transformer_encoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(self.dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(self.dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self): + for layer in self.transformer_encoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(self.dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_encoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def _apply_window_on_input_vec(self, embeddings): + window_size = 1 + zero_vec = torch.zeros(embeddings.shape[0], window_size-1, embeddings.shape[2], embeddings.shape[3]).to(embeddings.device) # B x (window_size-1) x num_features x emb_size + window_applied_input_vec = torch.cat([zero_vec, embeddings], dim=1) # B x (T+window_size-1) x num_features x emb_size + window_applied_input_vec = window_applied_input_vec.unfold(1, window_size, 1) # B x T x window_size x emb_size x num_features + window_applied_input_vec = window_applied_input_vec.transpose(3, 4) # B x T x window_size x num_features x emb_size + window_applied_input_vec = window_applied_input_vec.reshape(embeddings.shape[0]*embeddings.shape[1], -1, embeddings.shape[3]) # (B*T) x (num_features*window_size) x emb_size + return window_applied_input_vec + + def _apply_pos_enc(self, tgt): + pos = torch.arange(tgt.shape[1]).to(tgt.device) # (num_features*window_size+1) + pos = pos.unsqueeze(0).repeat(tgt.shape[0], 1) # (B*T) x (num_features*window_size+1) + tgt_pos = tgt + self.pos_enc(pos.long()) # (B*T) x (num_features*window_size+1) x emb_size + return tgt_pos + + def forward(self, input_tokens): + ''' + input_tokens: B x T x num_features + ''' + # prepare input vector + emb_list = [module(input_tokens[..., i]) for i, module in enumerate(self.layers)] # B x T x 1 x emb_size + stacked_emb = torch.stack(emb_list, dim=2) # B x T x num_features x emb_size + # apply window + stacked_emb = self._apply_window_on_input_vec(stacked_emb) + # add CLS + cls = self.cls_embedding.repeat(stacked_emb.shape[0], 1, 1) # (B*T) x 1 x 
emb_size + input_emb = torch.cat([stacked_emb, cls], dim=1) # (B*T) x (num_features*window_size+1) x emb_size + output = self.transformer_encoder(input_emb) # (B*T) x (num_features*window_size+1) x emb_size + # extract CLS + output = output[:, -1, :].reshape((input_tokens.shape[0], input_tokens.shape[1], -1)) # B x T x emb_size + return output + +class RVQMultiEmbedding(nn.Module): + def __init__( + self, + vocab:LangTokenVocab, + dim_model:int + ): + super().__init__() + self.vocab_size = vocab.get_vocab_size() + self.dim_model = dim_model + self.features = vocab.feature_list + self.layers = [] + self._make_emb_layers() + + def _make_emb_layers(self): + vocab_sizes = [self.vocab_size[key] for key in self.features] + self.embedding_sizes = [self.dim_model for _ in self.features] + for vocab_size, embedding_size in zip(vocab_sizes, self.embedding_sizes): + if embedding_size != 0: + self.layers.append(nn.Embedding(vocab_size, embedding_size)) + self.layers = nn.ModuleList(self.layers) + + def forward(self, x): + embeddings = torch.zeros(x.shape[0], x.shape[1], self.dim_model).to(x.device) + emb_list = [module(x[:, (idx+1)%4::4]) for idx, module in enumerate(self.layers)] + for idx, emb in enumerate(emb_list): + embeddings[:, (idx+1)%4::4] = emb + return embeddings + + def get_emb_by_key(self, key:str, token:torch.Tensor): + layer_idx = self.features.index(key) + return self.layers[layer_idx](token) + +class XtransformerDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class 
XtransformerCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class XtransformerLargeCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-large') + # frozen text encoder + for param 
in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class NewCrossAttendDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False, 
+ use_rmsnorm=True, + ff_swish = True, # set this to True + ff_glu = True, # set to true to use for all feedforwards + ) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class NewCrossAttendwithRoPEDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True, + cross_attend = True, + only_cross = False, + use_rmsnorm=True, + rotary_pos_emb = True, + ff_swish = True, # set this to True + ff_glu = True, # set to true to use for all feedforwards + ) + # add final dropout + print('Applying Xavier 
Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + else: + context = context_embedding + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True, context=context) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, context=context, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq, context=context) + +class XtransformerPrefixDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = PrefixDecoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in 
self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None): + assert context is not None, 'context should be provided for prefix decoder' + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask + ).last_hidden_state + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class XtransformerPretrainingDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + 
layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None, context_embedding=None): + + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + return hidden_vec, intermediates + else: + return self.transformer_decoder(seq) + +class XtransformerFinetuningDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + ).last_hidden_state + else: + context = context_embedding + + # concatenate context with seq + seq = torch.cat([context, seq], dim=1) 
# B x (T+context_length) x emb_size + if cache is not None: # implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + # cut to only return the seq part + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + # cut to only return the seq part + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec, intermediates + else: + # cut to only return the seq part + hidden_vec = self.transformer_decoder(seq) + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec + +class XtransformerLargeFinetuningDecoder(nn.Module): + def __init__( + self, + dim:int, + depth:int, + heads:int, + dropout:float + ): + super().__init__() + self._make_decoder_layer(dim, depth, heads, dropout) + self.text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-large') + # frozen text encoder + for param in self.text_encoder.parameters(): + param.requires_grad = False + + def _make_decoder_layer(self, dim, depth, heads, dropout): + self.transformer_decoder = Decoder( + dim = dim, + depth = depth, + heads = heads, + attn_dropout = dropout, + ff_dropout = dropout, + attn_flash = True) + # add final dropout + print('Applying Xavier Uniform Init to x-transformer following torch.Transformer') + self._apply_xavier_init() + print('Adding dropout after feedforward layer in x-transformer') + self._add_dropout_after_ff(dropout) + print('Adding dropout after attention layer in x-transformer') + self._add_dropout_after_attn(dropout) + + def _add_dropout_after_attn(self, dropout): + for layer in self.transformer_decoder.layers: + if 'Attention' in str(type(layer[1])): + if isinstance(layer[1].to_out, nn.Sequential): # if GLU + layer[1].to_out.append(nn.Dropout(dropout)) + elif isinstance(layer[1].to_out, nn.Linear): # if simple linear + layer[1].to_out = nn.Sequential(layer[1].to_out, nn.Dropout(dropout)) + else: + raise ValueError('to_out should be either nn.Sequential or nn.Linear') + + def _add_dropout_after_ff(self, dropout): + for layer in self.transformer_decoder.layers: + if 'FeedForward' in str(type(layer[1])): + layer[1].ff.append(nn.Dropout(dropout)) + + def _apply_xavier_init(self): + for name, param in self.transformer_decoder.named_parameters(): + if 'to_q' in name or 'to_k' in name or 'to_v' in name: + torch.nn.init.xavier_uniform_(param, gain=0.5**0.5) + + def forward(self, seq, cache=None,train=False,context=None,context_embedding=None): + assert context is not None or context_embedding is not None, 'context or context_embedding should be provided for prefix decoder' + if context_embedding is None: + input_ids = context['input_ids'].squeeze(1) if context['input_ids'].ndim == 3 else context['input_ids'] + attention_mask = context['attention_mask'].squeeze(1) if context['attention_mask'].ndim == 3 else context['attention_mask'] + assert input_ids is not None, 'input_ids should be provided for prefix decoder' + assert attention_mask is not None, 'attention_mask should be provided for prefix decoder' + assert input_ids.device == self.text_encoder.device, 'input_ids should be on the same device as text_encoder' + + context = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + ).last_hidden_state + else: + context = context_embedding + + # concatenate context with seq + seq = torch.cat([context, seq], dim=1) # B x (T+context_length) x emb_size + if cache is not None: # 
implementing run_one_step in inference + if cache.hiddens is None: cache = None + hidden_vec, intermediates = self.transformer_decoder(seq, cache=cache, return_hiddens=True) + # cut to only return the seq part + return hidden_vec, intermediates + else: + if train: + hidden_vec, intermediates = self.transformer_decoder(seq, return_hiddens=True) + # cut to only return the seq part + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec, intermediates + else: + # cut to only return the seq part + hidden_vec = self.transformer_decoder(seq) + hidden_vec = hidden_vec[:, context.shape[1]:, :] + return hidden_vec \ No newline at end of file diff --git a/SongEval/.DS_Store b/SongEval/.DS_Store new file mode 100644 index 0000000..ee3d00a Binary files /dev/null and b/SongEval/.DS_Store differ diff --git a/SongEval/LICENSE b/SongEval/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/SongEval/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/SongEval/README.md b/SongEval/README.md new file mode 100644 index 0000000..72807ae --- /dev/null +++ b/SongEval/README.md @@ -0,0 +1,88 @@ +# 🎵 SongEval: A Benchmark Dataset for Song Aesthetics Evaluation + +[![Hugging Face Dataset](https://img.shields.io/badge/HuggingFace-Dataset-blue)](https://huggingface.co/datasets/ASLP-lab/SongEval) +[![Arxiv Paper](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/pdf/2505.10793) +[![License: CC BY-NC-SA 4.0](https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by-nc-sa/4.0/) + + +This repository provides a **trained aesthetic evaluation toolkit** based on [SongEval](https://huggingface.co/datasets/ASLP-lab/SongEval), the first large-scale, open-source dataset for human-perceived song aesthetics. The toolkit enables **automatic scoring of generated song** across five perceptual aesthetic dimensions aligned with professional musician judgments. + +--- + +## 🌟 Key Features + +- 🧠 **Pretrained neural models** for perceptual aesthetic evaluation +- 🎼 Predicts **five aesthetic dimensions**: + - Overall Coherence + - Memorability + - Naturalness of Vocal Breathing and Phrasing + - Clarity of Song Structure + - Overall Musicality + +- 🎧 Accepts **full-length songs** (vocals + accompaniment) as input +- ⚙️ Simple inference interface + +--- + +## 📦 Installation + +Clone the repository and install dependencies: + +```bash +git clone https://github.com/ASLP-lab/SongEval.git +cd SongEval +pip install -r requirements.txt +``` + +## 🚀 Quick Start + +- Evaluate a single audio file: + +```bash +python eval.py -i /path/to/audio.mp3 -o /path/to/output +``` + +- Evaluate a list of audio files: + +```bash +python eval.py -i /path/to/audio_list.txt -o /path/to/output +``` + +- Evaluate all audio files in a directory: + +```bash +python eval.py -i /path/to/audio_directory -o /path/to/output +``` + +- Force evaluation on CPU (⚠️ CPU evaluation may be significantly slower) : + + +```bash +python eval.py -i /path/to/audio.wav -o /path/to/output --use_cpu True +``` + + +## 🙏 Acknowledgement +This project is mainly organized by the audio, speech and language processing lab [(ASLP@NPU)](http://www.npu-aslp.org/). 
+ +We sincerely thank the **Shanghai Conservatory of Music** for their expert guidance on music theory, aesthetics, and annotation design. +Meanwhile, we thank AISHELL for helping with the organization of the song annotations. + 

![Shanghai Conservatory of Music Logo](assets/logo.png)

+ +## 📑 License +This project is released under the CC BY-NC-SA 4.0 license. + +You are free to use, modify, and build upon it for non-commercial purposes, with attribution. + +## 📚 Citation +If you use this toolkit or the SongEval dataset, please cite the following: +``` +@article{yao2025songeval, + title = {SongEval: A Benchmark Dataset for Song Aesthetics Evaluation}, + author = {Yao, Jixun and Ma, Guobin and Xue, Huixin and Chen, Huakang and Hao, Chunbo and Jiang, Yuepeng and Liu, Haohe and Yuan, Ruibin and Xu, Jin and Xue, Wei and others}, + journal = {arXiv preprint arXiv:2505.10793}, + year={2025} +} + +``` diff --git a/SongEval/assets/logo.png b/SongEval/assets/logo.png new file mode 100644 index 0000000..d521073 Binary files /dev/null and b/SongEval/assets/logo.png differ diff --git a/SongEval/clap_score.py b/SongEval/clap_score.py new file mode 100644 index 0000000..912357a --- /dev/null +++ b/SongEval/clap_score.py @@ -0,0 +1,184 @@ +import os +import requests +from tqdm import tqdm +import torch +import numpy as np +import laion_clap +from clap_module.factory import load_state_dict +import librosa +import pyloudnorm as pyln + +# following documentation from https://github.com/LAION-AI/CLAP +def int16_to_float32(x): + return (x / 32767.0).astype(np.float32) + +def float32_to_int16(x): + x = np.clip(x, a_min=-1., a_max=1.) + return (x * 32767.).astype(np.int16) + + +def clap_score(id2text, audio_path, audio_files_extension='.wav', clap_model='music_speech_audioset_epoch_15_esc_89.98.pt'): + """ + Cosine similarity is computed between the LAION-CLAP text embedding of the given prompt and + the LAION-CLAP audio embedding of the generated audio. LION-CLAP: https://github.com/LAION-AI/CLAP + + This evaluation script assumes that audio_path files are identified with the ids in id2text. + + clap_score() evaluates all ids in id2text. + + GPU-based computation. + + Select one of the following models from https://github.com/LAION-AI/CLAP: + - music_speech_audioset_epoch_15_esc_89.98.pt (used by musicgen) + - music_audioset_epoch_15_esc_90.14.pt + - music_speech_epoch_15_esc_89.25.pt + - 630k-audioset-fusion-best.pt (our default, with "fusion" to handle longer inputs) + + Params: + -- id2text: dictionary with the mapping between id (generated audio filenames in audio_path) + and text (prompt used to generate audio). clap_score() evaluates all ids in id2text. + -- audio_path: path where the generated audio files to evaluate are available. + -- audio_files_extension: files extension (default .wav) in eval_path. + -- clap_model: choose one of the above clap_models (default: '630k-audioset-fusion-best.pt'). 
+ Returns: + -- CLAP-LION score + """ + # load model + if clap_model == 'music_speech_audioset_epoch_15_esc_89.98.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_audioset_epoch_15_esc_89.98.pt' + clap_path = 'load/clap_score/music_speech_audioset_epoch_15_esc_89.98.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == 'music_audioset_epoch_15_esc_90.14.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt' + clap_path = 'load/clap_score/music_audioset_epoch_15_esc_90.14.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == 'music_speech_epoch_15_esc_89.25.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_epoch_15_esc_89.25.pt' + clap_path = 'load/clap_score/music_speech_epoch_15_esc_89.25.pt' + model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda') + elif clap_model == '630k-audioset-fusion-best.pt': + url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/630k-audioset-fusion-best.pt' + clap_path = 'load/clap_score/630k-audioset-fusion-best.pt' + model = laion_clap.CLAP_Module(enable_fusion=True, device='cuda') + else: + raise ValueError('clap_model not implemented') + + # download clap_model if not already downloaded + if not os.path.exists(clap_path): + print('Downloading ', clap_model, '...') + os.makedirs(os.path.dirname(clap_path), exist_ok=True) + + response = requests.get(url, stream=True) + total_size = int(response.headers.get('content-length', 0)) + + with open(clap_path, 'wb') as file: + with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar: + for data in response.iter_content(chunk_size=8192): + file.write(data) + progress_bar.update(len(data)) + + # fixing CLAP-LION issue, see: https://github.com/LAION-AI/CLAP/issues/118 + pkg = load_state_dict(clap_path) + pkg.pop('text_branch.embeddings.position_ids', None) + model.model.load_state_dict(pkg) + model.eval() + + if not os.path.isdir(audio_path): + raise ValueError('audio_path does not exist') + + if id2text: + print('[EXTRACTING TEXT EMBEDDINGS] ') + batch_size = 64 + text_emb = {} + for i in tqdm(range(0, len(id2text), batch_size)): + batch_ids = list(id2text.keys())[i:i+batch_size] + batch_texts = [id2text[id] for id in batch_ids] + with torch.no_grad(): + embeddings = model.get_text_embedding(batch_texts, use_tensor=True) + for id, emb in zip(batch_ids, embeddings): + text_emb[id] = emb + + else: + raise ValueError('Must specify id2text') + + print('[EVALUATING GENERATIONS] ', audio_path) + score = 0 + count = 0 + for id in tqdm(id2text.keys()): + file_path = os.path.join(audio_path, str(id)+audio_files_extension) + with torch.no_grad(): + audio, _ = librosa.load(file_path, sr=48000, mono=True) # sample rate should be 48000 + audio = pyln.normalize.peak(audio, -1.0) + audio = audio.reshape(1, -1) # unsqueeze (1,T) + audio = torch.from_numpy(int16_to_float32(float32_to_int16(audio))).float() + audio_embeddings = model.get_audio_embedding_from_data(x = audio, use_tensor=True) + cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_emb[id].unsqueeze(0), dim=1, eps=1e-8)[0] + score += cosine_sim + count += 1 + + return score / count if count > 0 else 0 + + +if __name__ == "__main__": + + import pandas as pd + import json + import argparse + parser = argparse.ArgumentParser(description='Compute CLAP score for 
generated audio files.') + parser.add_argument('--clap_model', type=str, default='630k-audioset-fusion-best.pt', + help='CLAP model to use for evaluation. Options: music_speech_audioset_epoch_15_esc_89.98.pt, music_audioset_epoch_15_esc_90.14.pt, music_speech_epoch_15_esc_89.25.pt, 630k-audioset-fusion-best.pt (default: 630k-audioset-fusion-best.pt)') + parser.add_argument('--root_path', type=str, default='../wandb/run-20250627_172105-xpe7nh5n-worseInstr/generated_samples_text_conditioned_top_p_threshold_0.99_temperature_1.15_8', + help='Path to the directory containing generated audio files and id2text mapping.') + args = parser.parse_args() + clap_model = args.clap_model + root_path = args.root_path + json_file_path = os.path.join(root_path, 'name2prompt.jsonl') + generated_path = os.path.join(root_path, 'prompt_music') + if not os.path.exists(generated_path): + generated_path = root_path # if no 'music' subfolder, use root_path directly + + with open(json_file_path, 'r') as f: + id2text_dict = {} + for line in f: + item = json.loads(line) + for k, v in item.items(): + id2text_dict[k] = v[0] + print('length of id2text:', len(id2text_dict)) + # id2text = {k+'_1': v[0] for k, v in id2text_dict.items()} # assuming each key has a list of prompts, we take the first one + id2text = {} + for k, v in id2text_dict.items(): + if isinstance(v, list): + id2text[k] = v[0] + # check if k exists as a wav file + if os.path.exists(os.path.join(generated_path, str(k)+'.wav')): + id2text[k] = v[0] + else: + # find k_*, k_1, k_2, ... and check if they exist + for i in range(0, 10): # assuming no more than 10 variations + if os.path.exists(os.path.join(generated_path, str(k)+'_'+str(i)+'.wav')): + new_key = str(k) + '_' + str(i) + id2text[new_key] = v[0] + print('length of id2text after checking wav files:', len(id2text)) + # check that each wav file exists + new_id2text = {} + for id in id2text.keys(): + file_path = os.path.join(generated_path, str(id)+'.wav') + if os.path.exists(file_path): + new_id2text[id] = id2text[id] + else: + print(f"Warning: {file_path} does not exist, skipping this id.") + print('length of new_id2text:', len(new_id2text)) + + """ + IMPORTANT: the audios in generated_path should have the same ids as in id2text. + For musiccaps, you can load id2text as above and each generated_path audio file + corresponds to a prompt (text description) in musiccaps. Files are named with ids, as follows: + - your_model_outputs_folder/_-kssA-FOzU.wav + - your_model_outputs_folder/_0-2meOf9qY.wav + - your_model_outputs_folder/_1woPC5HWSg.wav + ... 
+ - your_model_outputs_folder/ZzyWbehtt0M.wav + """ + + clp = clap_score(new_id2text, generated_path, audio_files_extension='.wav') + print('CLAP score (cosine similarity):', clp) \ No newline at end of file diff --git a/SongEval/config.yaml b/SongEval/config.yaml new file mode 100644 index 0000000..f30b498 --- /dev/null +++ b/SongEval/config.yaml @@ -0,0 +1,6 @@ +generator: + _target_: model.Generator + in_features: 1024 + ffd_hidden_size: 4096 + num_classes: 5 + attn_layer_num: 4 \ No newline at end of file diff --git a/SongEval/controlability.py b/SongEval/controlability.py new file mode 100644 index 0000000..1cb19aa --- /dev/null +++ b/SongEval/controlability.py @@ -0,0 +1,456 @@ +import json + +generate_path = 'Text2midi/muzic/musecoco/2-attribute2music_model/generation/0505/linear_mask-1billion-attribute2music/infer_test/topk15-t1.0-ngram0/all_midis' +# generate_path = 'Text2midi/t2m-inferalign/text2midi_infer_output' +# generate_path = 'wandb/no-disp-no-ciem/text_condi_top_p_t0.99_temp1.25' +test_set_json = "dataset/midicaps/train.json" + +generated_eval_json_path = f"{generate_path}/eval.json" +generated_name2prompt_jsonl_path = f"{generate_path}/name2prompt.jsonl" + +# 1. Read the test set and build a mapping from prompt to entry +with open(test_set_json, 'r') as f: + test_set = [] + for line in f: + if not line.strip(): + continue + item = json.loads(line.strip()) + test_set.append(item) +prompt2item = {item['caption']: item for item in test_set if item['test_set'] is True} +print(f"Number of prompts in test set: {len(prompt2item)}") +# 2. Read name2prompt.jsonl and build a mapping from name to prompt +name2prompt = {} +with open(generated_name2prompt_jsonl_path, 'r') as f: + for line in f: + obj = json.loads(line) + name2prompt.update({k: v[0] for k, v in obj.items() if isinstance(v, list) and len(v) > 0}) +# 3. Read eval.json +with open(generated_eval_json_path, 'r') as f: + eval_items = [] + for line in f: + if not line.strip(): + continue + item = json.loads(line.strip()) + eval_items.append(item) + +# 4. For each name, find its prompt, make sure the prompt is in the test set, then find the matching entry in eval.json +results = [] +# turn the name of eval_items into relative name +for item in eval_items: + item['name'] = item['name'].split('/')[-1] # assume name is a path; take the last component as the relative name + # drop everything after the second underscore + if '_' in item['name']: + item['name'] = item['name'].split('.')[0].split('_')[0] + '_' + item['name'].split('.')[0].split('_')[1] + # print(f"Processed eval item name: {item['name']}") + +for name, prompt in name2prompt.items(): + if prompt not in prompt2item: + print(f"Prompt not found in test set: {prompt}") + continue + # find the matching entry in eval.json (assuming eval.json entries have a 'name' field) + eval_entry = next((item for item in eval_items if item.get('name') == name), None) + if eval_entry is None: + print(f"Eval entry not found for name: {name}") + continue + # original (ground-truth) entry + original_entry = prompt2item[prompt] + results.append({ + 'name': name, + 'prompt': prompt, + 'eval_entry': eval_entry, + 'original_entry': original_entry + }) +print(f"Number of results: {len(results)}") +print(f"Sample result: {results[0] if results else 'No results'}") + +def calculate_TBT_score(results): + """ + • Tempo Bin with Tolerance (TBT): The predicted bpm falls into the ground truth tempo bin or +a neighboring one. 
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'tempo' in eval_entry and 'tempo' in original_entry: + eval_tempo = eval_entry['tempo'][0] if isinstance(eval_entry['tempo'], list) else eval_entry['tempo'] + original_tempo = original_entry['tempo'] + if original_tempo is None or eval_tempo is None: + continue # 如果原始条目没有 tempo,跳过 + # 检查 eval_tempo 是否在 original_tempo 的范围内 + if original_tempo - 10 <= eval_tempo <= original_tempo + 15: + correct += 1 + total += 1 + TB_score = correct / total if total > 0 else 0 + print(f"TB Score: {TB_score:.4f} (Correct: {correct}, Total: {total})") + return TB_score + +def calculate_CK_score(results): + """ + • Correct Key (CK): The predicted key matches the ground truth key. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'key' in eval_entry and 'key' in original_entry: + eval_key = eval_entry['key'][0] if isinstance(eval_entry['key'], list) else eval_entry['key'] + eval_key = eval_key if eval_key is not None else "C major" # 默认值为 C 大调 + original_key = original_entry['key'] if original_entry['key'] is not None else "C major" # 默认值为 C 大调 + if original_key is None or eval_key is None: + continue + if eval_key == original_key: + correct += 1 + total += 1 + CK_score = correct / total if total > 0 else 0 + print(f"CK Score: {CK_score:.4f} (Correct: {correct}, Total: {total})") + return CK_score +def calculate_CKD_score(results): + """ + Correct Key with Duplicates (CKD): The predicted key matches the ground truth key or an equivalent key (i.e., a major key and its relative minor). + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'key' in eval_entry and 'key' in original_entry: + eval_key = eval_entry['key'][0] if isinstance(eval_entry['key'], list) else eval_entry['key'] + if eval_key is None: + eval_key = "C major" # 默认值为 C 大调 + original_key = original_entry['key'] if original_entry['key'] is not None else "C major" + if original_key is None or eval_key is None: + continue # 如果原始条目没有 key,跳过 + # 检查 eval_key 是否与 original_key 相同或是其相对小调 + if eval_key == original_key or (eval_key.split(' ')[0] == original_key.split(' ')[0]): + correct += 1 + total += 1 + CKD_score = correct / total if total > 0 else 0 + print(f"CKD Score: {CKD_score:.4f} (Correct: {correct}, Total: {total})") + return CKD_score + +def calculate_CTS_score(results): + """ + • Correct Time Signature (CTS): The predicted time signature matches the ground truth time signature. 
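The CKD check above treats two keys as equivalent when they merely share a tonic (so 'C major' and 'C minor' would match), while the metric description pairs a major key with its relative minor. A hedged sketch of that stricter reading follows; the relative-key table is standard music theory, not taken from this repository.

# Illustrative sketch only: relative major/minor equivalence for a CKD-style check.
RELATIVE_MINOR = {
    "C major": "A minor", "G major": "E minor", "D major": "B minor",
    "A major": "F# minor", "E major": "C# minor", "B major": "G# minor",
    "F# major": "D# minor", "Db major": "Bb minor", "Ab major": "F minor",
    "Eb major": "C minor", "Bb major": "G minor", "F major": "D minor",
}

def keys_equivalent(pred_key: str, gt_key: str) -> bool:
    """True if the keys match exactly or form a relative major/minor pair."""
    if pred_key == gt_key:
        return True
    return RELATIVE_MINOR.get(pred_key) == gt_key or RELATIVE_MINOR.get(gt_key) == pred_key

assert keys_equivalent("C major", "A minor")
assert not keys_equivalent("C major", "C minor")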
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'time_signature' in eval_entry and 'time_signature' in original_entry: + eval_time_signature = eval_entry['time_signature'][0] if isinstance(eval_entry['time_signature'], list) else eval_entry['time_signature'] + original_time_signature = original_entry['time_signature'] + if original_time_signature is None or eval_time_signature is None: + continue # 如果原始条目没有 time signature,跳过 + if eval_time_signature == original_time_signature: + correct += 1 + else: + # 检查是否为相同的节拍(如 4/4 和 2/2) + eval_numerator, eval_denominator = map(int, eval_time_signature.split('/')) + original_numerator, original_denominator = map(int, original_time_signature.split('/')) + if (eval_numerator == original_numerator and eval_denominator == original_denominator) or \ + (eval_numerator * 2 == original_numerator and eval_denominator == original_denominator): + correct += 1 + total += 1 + CTS_score = correct / total if total > 0 else 0 + print(f"CTS Score: {CTS_score:.4f} (Correct: {correct}, Total: {total})") + return CTS_score + +def calculate_ECM_score(results): + """ + Exact Chord Match (ECM): The predicted + chord sequence matches the ground truth exactly + in terms of order, chord root, and chord type, with + tolerance for missing and excess chord instances. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'chord_summary' in eval_entry and 'chord_summary' in original_entry: + eval_chord_summary = eval_entry['chord_summary'][0] if isinstance(eval_entry['chord_summary'], list) else eval_entry['chord_summary'] + original_chord_summary = original_entry['chord_summary'] + if original_chord_summary is None or eval_chord_summary is None: + continue + # 检查 eval_chord_summary 是否包含 original_chord_summary,两个都是列表,每个元素是一个字符串 + if eval_chord_summary == original_chord_summary: + correct += 1 + total += 1 + ECM_score = correct / total if total > 0 else 0 + print(f"ECM Score: {ECM_score:.4f} (Correct: {correct}, Total: {total})") + return ECM_score + +def calculate_CMO_score(results): + """ + • Chord Match in any Order (CMO): The portion of predicted chord sequence matching the +ground truth chord root and type, in any order + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'chords' in eval_entry and 'chord_summary' in original_entry: + eval_chords_seq = eval_entry['chords'] + # remove the confidence score from eval_chords_seq + if isinstance(eval_chords_seq, list) and len(eval_chords_seq) > 0 and isinstance(eval_chords_seq[0], list): + eval_chords_seq = [chord[0] for chord in eval_chords_seq] + original_chord_summary = original_entry['chord_summary'] + if original_chord_summary is None or eval_chords_seq is None: + continue + # 检查 eval_chords_seq 是否包含 original_chord_summary,两个都是列表 + eval_chords_set = set(eval_chords_seq) # [['C', 0.464399092], ['G', 2.879274376]] + original_chord_set = set(original_chord_summary) # ['G', 'C'] + if original_chord_set.issubset(eval_chords_set): + correct += 1 + else: + if original_chord_set == eval_chords_set: + correct += 1 + total += 1 + CMO_score = correct / total if total > 0 else 0 + print(f"CMO Score: {CMO_score:.4f} (Correct: {correct}, Total: {total})") + return CMO_score + +def calculate_CI_score(results): + """ + •Correct Instrument (CI): The predicted instrument matches 
the ground truth instrument. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mapped_instruments_summary' in eval_entry and 'instrument_summary' in original_entry: + eval_instrument = eval_entry['mapped_instruments_summary'] if isinstance(eval_entry['mapped_instruments'], list) else eval_entry['mapped_instruments'] + original_instrument = original_entry['instrument_summary'] + if original_instrument is None or eval_instrument is None: + continue + # 检查 eval_instrument 是否包含 original_instrument + if isinstance(eval_instrument, list): + eval_instrument_set = set(eval_instrument) + original_instrument_set = set(original_instrument) + if original_instrument_set.issubset(eval_instrument_set): + correct += 1 + else: + if eval_instrument == original_instrument: + correct += 1 + total += 1 + CI_score = correct / total if total > 0 else 0 + print(f"CI Score: {CI_score:.4f} (Correct: {correct}, Total: {total})") + return CI_score + +def calculate_CI_top1_score(results): + """ + •Correct Instrument Top-1 (CI_top1): The predicted instrument matches the ground truth instrument + or is one of the top 3 predicted instruments. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mapped_instruments_summary' in eval_entry and 'instrument_summary' in original_entry: + eval_instrument = eval_entry['mapped_instruments_summary'] if isinstance(eval_entry['mapped_instruments'], list) else eval_entry['mapped_instruments'] + original_instrument = original_entry['instrument_summary'] + if original_instrument is None or eval_instrument is None: + continue + # 检查 eval_instrument 是否包含 original_instrument中的一个元素 + if isinstance(eval_instrument, list): + eval_instrument_set = set(eval_instrument) + original_instrument_set = set(original_instrument) + for inst in original_instrument_set: + if inst in eval_instrument_set: + correct += 1 + break + else: + if eval_instrument == original_instrument: + correct += 1 + total += 1 + CI_top1_score = correct / total if total > 0 else 0 + print(f"CI Top-1 Score: {CI_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CI_top1_score + +def calculate_CG_score(results): + """ + • Correct Genre (CG): The predicted genre matches the ground truth genre. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'genre' in eval_entry and 'genre' in original_entry: + eval_genre = eval_entry['genre'][0] if isinstance(eval_entry['genre'], list) else eval_entry['genre'] + original_genre = original_entry['genre'] + if original_genre is None or eval_genre is None: + continue + # 检查 eval_genre 是否包含 original_genre + if isinstance(eval_genre, list): + eval_genre_set = set(eval_genre) + original_genre_set = set(original_genre) + if original_genre_set.issubset(eval_genre_set): + correct += 1 + else: + if eval_genre == original_genre: + correct += 1 + total += 1 + CG_score = correct / total if total > 0 else 0 + print(f"CG Score: {CG_score:.4f} (Correct: {correct}, Total: {total})") + return CG_score + +def calculate_CG_top1_score(results): + """ + • Correct Genre Top-1 (CG_top1): The predicted genre matches the ground truth genre or is one of the top 3 predicted genres. 
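The *_top1 variants (CI_top1 above, CG_top1 and CM_top1 around it) count a prediction as correct as soon as any ground-truth label appears among the predicted labels. A compact sketch of that rule, with made-up labels:

# Illustrative only: the "any overlap" rule used by the *_top1 scores.
def any_label_overlap(predicted, ground_truth) -> bool:
    """True if at least one ground-truth label was predicted."""
    return bool(set(ground_truth) & set(predicted))

print(any_label_overlap(["piano", "violin", "drums"], ["violin", "flute"]))  # True
print(any_label_overlap(["piano"], ["flute"]))                               # False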
+ """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'genre' in eval_entry and 'genre' in original_entry: + eval_genre = eval_entry['genre'][0] if isinstance(eval_entry['genre'], list) else eval_entry['genre'] + original_genre = original_entry['genre'] + if original_genre is None or eval_genre is None: + continue + # 检查 eval_genre 是否包含 original_genre中的一个元素 + if isinstance(eval_genre, list): + eval_genre_set = set(eval_genre) + original_genre_set = set(original_genre) + for gen in original_genre_set: + if gen in eval_genre_set: + correct += 1 + break + else: + if eval_genre == original_genre: + correct += 1 + total += 1 + CG_top1_score = correct / total if total > 0 else 0 + print(f"CG Top-1 Score: {CG_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CG_top1_score + +def calculate_CM_score(results): + """ + • Correct Mood (CM): The predicted mood matches the ground truth mood. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mood' in eval_entry and 'mood' in original_entry: + eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood'] + original_mood = original_entry['mood'] + if original_mood is None or eval_mood is None: + continue + # 检查 eval_mood 是否包含 original_mood + if isinstance(eval_mood, list): + eval_mood_set = set(eval_mood) + original_mood_set = set(original_mood) + if original_mood_set.issubset(eval_mood_set): + correct += 1 + else: + if eval_mood == original_mood: + correct += 1 + total += 1 + CM_score = correct / total if total > 0 else 0 + print(f"CM Score: {CM_score:.4f} (Correct: {correct}, Total: {total})") + return CM_score + +def calculate_CM_top1_score(results): + """ + • Correct Mood Top-1 (CM_top1): The predicted mood matches the ground truth mood or is one of the top 3 predicted moods. + """ + correct = 0 + total = 0 + for result in results: + eval_entry = result['eval_entry'] + original_entry = result['original_entry'] + if 'mood' in eval_entry and 'mood' in original_entry: + eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood'] + original_mood = original_entry['mood'] + if original_mood is None or eval_mood is None: + continue + # 检查 eval_mood 是否包含 original_mood中的一个元素 + if isinstance(eval_mood, list): + eval_mood_set = set(eval_mood) + original_mood_set = set(original_mood) + for mood in original_mood_set: + if mood in eval_mood_set: + correct += 1 + break + else: + if eval_mood == original_mood: + correct += 1 + total += 1 + CM_top1_score = correct / total if total > 0 else 0 + print(f"CM Top-1 Score: {CM_top1_score:.4f} (Correct: {correct}, Total: {total})") + return CM_top1_score + +def calculate_CM_top3_score(results): + """ + • Correct Mood Top-3 (CM_top3): The predicted mood matches the ground truth mood or is one of the top 3 predicted moods. 
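The CM_top3 implementation that follows (its docstring repeats the Top-1 wording) actually counts a hit when all ground-truth moods are predicted, if there are at most three of them, or when at least three of them are predicted otherwise. A toy illustration of that rule, with invented mood labels:

# Illustrative only: mirror of the CM_top3 rule implemented just below.
def cm_top3_hit(predicted, ground_truth) -> bool:
    pred, gt = set(predicted), set(ground_truth)
    if len(gt) <= 3:
        return gt.issubset(pred)
    return len(gt & pred) >= 3

print(cm_top3_hit(["calm", "happy", "epic"], ["calm", "happy"]))    # True: both ground-truth moods predicted
print(cm_top3_hit(["calm"], ["calm", "happy", "epic", "dark"]))     # False: only 1 of 4 matched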
+    """
+    correct = 0
+    total = 0
+    for result in results:
+        eval_entry = result['eval_entry']
+        original_entry = result['original_entry']
+        if 'mood' in eval_entry and 'mood' in original_entry:
+            eval_mood = eval_entry['mood'][0] if isinstance(eval_entry['mood'], list) else eval_entry['mood']
+            original_mood = original_entry['mood']
+            if original_mood is None or eval_mood is None:
+                continue
+            # check whether eval_mood contains at least 3 of the moods in original_mood
+            if isinstance(eval_mood, list):
+                eval_mood_set = set(eval_mood)
+                original_mood_set = set(original_mood)
+                if len(original_mood_set) <= 3 and original_mood_set.issubset(eval_mood_set):
+                    correct += 1
+                elif len(original_mood_set) > 3:
+                    match_num = sum(1 for mood in original_mood_set if mood in eval_mood_set)
+                    if match_num >= 3:
+                        correct += 1
+            else:
+                if eval_mood == original_mood:
+                    correct += 1
+            total += 1
+    CM_top3_score = correct / total if total > 0 else 0
+    print(f"CM Top-3 Score: {CM_top3_score:.4f} (Correct: {correct}, Total: {total})")
+    return CM_top3_score
+
+def calculate_all_scores(results):
+    """
+    Calculate all scores and return them as a dictionary.
+    """
+    scores = {
+        'TBT_score': calculate_TBT_score(results),
+        'CK_score': calculate_CK_score(results),
+        'CKD_score': calculate_CKD_score(results),
+        'CTS_score': calculate_CTS_score(results),
+        'ECM_score': calculate_ECM_score(results),
+        'CMO_score': calculate_CMO_score(results),
+        'CI_score': calculate_CI_score(results),
+        'CI_top1_score': calculate_CI_top1_score(results),
+        'CG_score': calculate_CG_score(results),
+        'CG_top1_score': calculate_CG_top1_score(results),
+        'CM_score': calculate_CM_score(results),
+        'CM_top1_score': calculate_CM_top1_score(results),
+        'CM_top3_score': calculate_CM_top3_score(results)
+    }
+    return scores
+if __name__ == "__main__":
+    scores = calculate_all_scores(results)
+    print("All Scores:")
+    for score_name, score_value in scores.items():
+        print(f"{score_name}: {score_value:.4f}")
+
+    # Save the results to a JSON file
+    output_file = f"{generate_path}/results.json"
+    with open(output_file, 'w') as f:
+        json.dump(scores, f, indent=4)
+    print(f"Results saved to {output_file}")
+
diff --git a/SongEval/ebr.py b/SongEval/ebr.py
new file mode 100644
index 0000000..4b7a4f8
--- /dev/null
+++ b/SongEval/ebr.py
@@ -0,0 +1,103 @@
+import argparse
+import glob
+import os
+import pandas as pd
+import muspy
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from tqdm import tqdm
+
+def compute_midi_metrics(file_path):
+    """Compute music metrics for a single MIDI file."""
+    try:
+        music = muspy.read(file_path)
+        scale_consistency = muspy.scale_consistency(music)
+        pitch_entropy = muspy.pitch_entropy(music)
+        pitch_class_entropy = muspy.pitch_class_entropy(music)
+        empty_beat_rate = muspy.empty_beat_rate(music)
+        groove_consistency = muspy.groove_consistency(music, 12)
+        metrics = {
+            'scale_consistency': scale_consistency,
+            'pitch_entropy': pitch_entropy,
+            'pitch_class_entropy': pitch_class_entropy,
+            'empty_beat_rate': empty_beat_rate,
+            'groove_consistency': groove_consistency,
+            'filename': os.path.basename(file_path)
+        }
+        return metrics
+    except Exception as e:
+        print(f"Error while processing {os.path.basename(file_path)}: {str(e)}")
+        return None
+
+def compute_directory_metrics(directory_path, num_workers=8):
+    """Compute music metrics for all MIDI files in a directory (multi-threaded)."""
+    midi_files = []
+    for root, _, files in os.walk(directory_path):
+        for file in files:
+            if file.lower().endswith(('.mid', '.midi')):
+                midi_files.append(os.path.join(root, file))
+    if not midi_files:
+        print("No MIDI files found in the directory or its subfolders")
+        return None
+
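compute_midi_metrics above wraps a handful of muspy objective metrics; a minimal standalone usage sketch, assuming muspy is installed and some local file such as example.mid exists:

# Illustrative only: the same muspy metrics computed for one (hypothetical) file.
import muspy

music = muspy.read("example.mid")  # hypothetical input path
print("scale consistency:  ", muspy.scale_consistency(music))
print("pitch entropy:      ", muspy.pitch_entropy(music))
print("pitch class entropy:", muspy.pitch_class_entropy(music))
print("empty beat rate:    ", muspy.empty_beat_rate(music))
print("groove consistency: ", muspy.groove_consistency(music, 12))  # 12 = measure resolution, as above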
+    all_metrics = []
+    average_metrics = {
+        'scale_consistency': 0,
+        'pitch_entropy': 0,
+        'pitch_class_entropy': 0,
+        'empty_beat_rate': 0,
+        'groove_consistency': 0
+    }
+    current_num = 0
+    total_scale_consistency = 0
+    total_pitch_entropy = 0
+    total_pitch_class_entropy = 0
+    total_empty_beat_rate = 0
+    total_groove_consistency = 0
+    print(f"Processing directory: {directory_path}")
+    print(f"Found {len(midi_files)} MIDI files:")
+
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        futures = {executor.submit(compute_midi_metrics, midi_file): midi_file for midi_file in midi_files}
+        for future in tqdm(as_completed(futures), total=len(midi_files), desc="processing"):
+            metrics = future.result()
+
+            if metrics is not None:
+                current_num += 1
+                total_scale_consistency += metrics['scale_consistency']
+                total_pitch_entropy += metrics['pitch_entropy']
+                total_pitch_class_entropy += metrics['pitch_class_entropy']
+                total_empty_beat_rate += metrics['empty_beat_rate']
+                total_groove_consistency += metrics['groove_consistency']
+                average_metrics['scale_consistency'] = total_scale_consistency / current_num
+                average_metrics['pitch_entropy'] = total_pitch_entropy / current_num
+                average_metrics['pitch_class_entropy'] = total_pitch_class_entropy / current_num
+                average_metrics['empty_beat_rate'] = total_empty_beat_rate / current_num
+                average_metrics['groove_consistency'] = total_groove_consistency / current_num
+                print("current_metrics:", metrics)
+
+                all_metrics.append(metrics)
+
+    if not all_metrics:
+        print("All files failed to process")
+        return None
+
+    df = pd.DataFrame(all_metrics)
+    output_csv = os.path.join(directory_path, "midi_metrics_report.csv")
+    df.to_csv(output_csv, index=False)
+    avg_metrics = df.mean(numeric_only=True)
+    return df, avg_metrics
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Compute music metrics for all MIDI files in a directory")
+    parser.add_argument("path", type=str, help="Path to a directory containing MIDI files")
+    parser.add_argument("--threads", type=int, default=1, help="Number of worker threads (default: 1)")
+    args = parser.parse_args()
+
+    if not os.path.isdir(args.path):
+        print(f"Error: path '{args.path}' does not exist or is not a directory")
+    else:
+        ret = compute_directory_metrics(args.path, num_workers=args.threads)
+        if ret is not None:
+            result, averages = ret
+            print("\n计算完成!
结果已保存到 midi_metrics_report.csv") + print("\n平均指标值:") + print(averages.to_string()) diff --git a/SongEval/eval.py b/SongEval/eval.py new file mode 100644 index 0000000..0d1c0d3 --- /dev/null +++ b/SongEval/eval.py @@ -0,0 +1,150 @@ +import glob +import os +import json +import librosa +import numpy as np +import torch +import argparse +from muq import MuQ +from hydra.utils import instantiate +from omegaconf import OmegaConf +from safetensors.torch import load_file +from tqdm import tqdm + + + +class Synthesizer(object): + + def __init__(self, + checkpoint_path, + input_path, + output_dir, + use_cpu: bool = False): + + self.checkpoint_path = checkpoint_path + self.input_path = input_path + self.output_dir = output_dir + os.makedirs(self.output_dir, exist_ok=True) + self.device = torch.device('cuda') if (torch.cuda.is_available() and (not use_cpu)) else torch.device('cpu') + + @torch.no_grad() + def setup(self): + + train_config = OmegaConf.load(os.path.join(os.path.dirname(self.checkpoint_path), '../config.yaml')) + model = instantiate(train_config.generator).to(self.device).eval() + state_dict = load_file(self.checkpoint_path, device="cpu") + model.load_state_dict(state_dict, strict=False) + + self.model = model + self.muq = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter") + self.muq = self.muq.to(self.device).eval() + self.result_dcit = {} + + @torch.no_grad() + def synthesis(self): + if os.path.isfile(self.input_path): + if self.input_path.endswith(('.wav', '.mp3')): + lines = [] + lines.append(self.input_path) + else: + with open(self.input_path, "r") as f: + lines = [line for line in f] + input_files = [{ + "input_path": line.strip(), + } for line in lines] + print(f"input filelst: {self.input_path}") + elif os.path.isdir(self.input_path): + input_files = [{ + "input_path": file, + }for file in glob.glob(os.path.join(self.input_path, '*')) if file.lower().endswith(('.wav', '.mp3'))] + else: + raise ValueError(f"input_path {self.input_path} is not a file or directory") + + + for input in tqdm(input_files): + try: + self.handle(**input) + except Exception as e: + print(e) + continue + # add average + avg_values = {} + for key in self.result_dcit[list(self.result_dcit.keys())[0]].keys(): + avg_values[key] = round(np.mean([self.result_dcit[fid][key] for fid in self.result_dcit]), 4) + self.result_dcit['average'] = avg_values + # save result + with open(os.path.join(self.output_dir, "result.json") , "w")as f: + json.dump(self.result_dcit, f, indent=4, ensure_ascii=False) + + @torch.no_grad() + def handle(self, input_path): + + fid = os.path.basename(input_path).split('.')[0] + if input_path.endswith('.npy'): + input = np.load(input_path) + + # check ssl + if len(input.shape) == 3 and input.shape[0] != 1: + print('ssl_shape error', input_path) + return + if np.isnan(input).any(): + print('ssl nan', input_path) + return + + input = torch.from_numpy(input).to(self.device) + if len(input.shape) == 2: + input = input.unsqueeze(0) + + if input_path.endswith(('.wav', '.mp3')): + wav, sr = librosa.load(input_path, sr=24000) + audio = torch.tensor(wav).unsqueeze(0).to(self.device) + output = self.muq(audio, output_hidden_states=True) + input = output["hidden_states"][6] + + values = {} + scores_g = self.model(input).squeeze(0) + values['Coherence'] = round(scores_g[0].item(), 4) + values['Musicality'] = round(scores_g[1].item(), 4) + values['Memorability'] = round(scores_g[2].item(), 4) + values['Clarity'] = round(scores_g[3].item(), 4) + values['Naturalness'] = round(scores_g[4].item(), 4) + + + 
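The five Generator outputs just above are mapped one-to-one onto the aspect names; a compact sketch of that mapping with a made-up score vector:

# Illustrative only: turning the 5-dim score vector into the per-file result dict.
import torch

ASPECTS = ["Coherence", "Musicality", "Memorability", "Clarity", "Naturalness"]
scores_g = torch.tensor([2.08, 2.00, 1.93, 2.02, 1.99])  # invented example output of the scoring head
values = {name: round(score.item(), 4) for name, score in zip(ASPECTS, scores_g)}
print(values)  # {'Coherence': 2.08, ..., 'Naturalness': 1.99}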
self.result_dcit[fid] = values + # delete + del input, output, scores_g, values,audio, wav, sr + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", "--input_path", + type=str, + required=True, + help="Input audio: path to a single file, a text file listing audio paths, or a directory of audio files." + ) + parser.add_argument( + "-o", "--output_dir", + type=str, + required=True, + help="Output directory for generated results (will be created if it doesn't exist)." + ) + parser.add_argument( + "--use_cpu", + type=str, + help="Force CPU mode even if a GPU is available.", + default=False + ) + + args = parser.parse_args() + + ckpt_path = "ckpt/model.safetensors" + + synthesizer = Synthesizer(checkpoint_path=ckpt_path, + input_path=args.input_path, + output_dir=args.output_dir, + use_cpu=args.use_cpu) + + synthesizer.setup() + + synthesizer.synthesis() \ No newline at end of file diff --git a/SongEval/generate-batch_easy.py b/SongEval/generate-batch_easy.py new file mode 100644 index 0000000..05e5c38 --- /dev/null +++ b/SongEval/generate-batch_easy.py @@ -0,0 +1,404 @@ +import sys +import os +from pathlib import Path +from multiprocessing import Process,set_start_method +import torch +import argparse +from omegaconf import OmegaConf +import json +from collections import defaultdict + +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, + prepare_model_and_dataset_from_config, + get_best_ckpt_path_and_config, + Evaluator +) +from transformers import T5Tokenizer, T5EncoderModel + +from Amadeus import model_zoo +from Amadeus.symbolic_encoding import data_utils +from Amadeus.model_zoo import AmadeusModel +from Amadeus.symbolic_encoding.data_utils import TuneCompiler +from Amadeus.symbolic_encoding.compile_utils import shift_and_pad +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor +from Amadeus.symbolic_encoding import decoding_utils +from Amadeus.train_utils import adjust_prediction_order +from data_representation import vocab_utils +from data_representation.vocab_utils import LangTokenVocab + + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-generation_type", + type=str, + choices=('conditioned', 'unconditioned', 'text-conditioned'), + default='unconditioned', + help="generation type", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-num_samples", + type=int, + default=30, + help="number of samples to generate", + ) + parser.add_argument( + "-num_target_measure", + type=int, + default=4, + help="number of target measures for conditioned generation", + ) + parser.add_argument( + "-choose_selected_tunes", + action='store_true', + help="generate samples from selected tunes, only for SOD dataset", + ) + parser.add_argument( + "-generate_length", + type=int, + default=1024, + help="length of the generated sequence", + ) + parser.add_argument( + "-num_processes", + type=int, + default=2, + help="number of processes to use", + ) + parser.add_argument( + "-gpu_ids", + type=str, + default="0,5", + help="comma-separated list of GPU IDs to use 
(e.g., '0,1,2,3')", + ) + parser.add_argument( + "-prompt", + type=str, + default="With a rhythm of 100 BPM, this classical piece in 1/4 time signature in the key of Eb major creates a classical mood using String Ensemble, Pizzicato Strings, Tremolo Strings, Trumpet, Timpani.", + help="prompt for generation, only used for conditioned generation", + ) + parser.add_argument( + "-prompt_file", + type=str, + default="dataset/midicaps/train.json", + help="file containing prompts for text-conditioned generation", + ) + return parser + +def load_resources(wandb_exp_dir, device): + """Load model and dataset resources for a process""" + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, wandb_exp_dir) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + # Load checkpoint to specified device + ckpt = torch.load(ckpt_path, map_location=device) + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + # Prepare dataset for prompts + condition_list = [x[1] for x in test_set.data_list] + dataset_for_prompt = [] + for i in range(len(condition_list)): + condition = test_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + dataset_for_prompt.append((condition, condition_list[i])) + + return config, model, dataset_for_prompt, vocab + +def conditioned_worker(process_idx, gpu_id, args, data_slice): + """Worker process for conditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"cond_{args.num_target_measure}m_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Process assigned data slice + for idx, (tune_in_idx, tune_name) in enumerate(data_slice): + batch_dir = base_path / f"process_{process_idx}_batch_{idx}" + batch_dir.mkdir(parents=True, exist_ok=True) + evaluator.generate_samples_with_prompt( + batch_dir, + args.num_target_measure, + tune_in_idx, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length + ) +def generate_samples_unconditioned(config, vocab, model, device,save_dir, num_samples, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072,uid=1): + encoding_scheme = config.nn_params.encoding_scheme + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + try: + in_beat_resolution = in_beat_resolution_dict[config.dataset] + except KeyError: + in_beat_resolution = 4 # Default resolution if dataset is not found + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)(vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset) + + for i in range(num_samples): + generated_sample = model.generate(0, 
generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature)
+        if encoding_scheme == 'nb':
+            generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature)
+        decoder(generated_sample, output_path=str(save_dir / f"{uid}_{i}.mid"))
+
+def generate_samples_with_text_prompt(config, vocab, model, device, save_dir, prompt, first_pred_feature, sampling_method, threshold, temperature, generation_length=3072, uid=1):
+    encoding_scheme = config.nn_params.encoding_scheme
+    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
+    encoder = T5EncoderModel.from_pretrained('google/flan-t5-large').to(device)
+    print(f"Using T5EncoderModel for text prompt: {prompt}")
+    context = tokenizer(prompt, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(device)
+    context = encoder(**context).last_hidden_state
+    in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4}
+    try:
+        in_beat_resolution = in_beat_resolution_dict[config.dataset]
+    except KeyError:
+        in_beat_resolution = 4  # Default resolution if dataset is not found
+    midi_decoder_dict = {'remi': 'MidiDecoder4REMI', 'cp': 'MidiDecoder4CP', 'nb': 'MidiDecoder4NB'}
+    decoder_name = midi_decoder_dict[encoding_scheme]
+    decoder = getattr(decoding_utils, decoder_name)(vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset)
+
+    generated_sample = model.generate(0, generation_length, condition=None, num_target_measures=None, sampling_method=sampling_method, threshold=threshold, temperature=temperature, context=context)
+    if encoding_scheme == 'nb':
+        generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature)
+    # Open the jsonl file and count its lines to determine the current prompt index
+    jsonl_path = save_dir / "name2prompt.jsonl"
+    if jsonl_path.exists():
+        with open(jsonl_path, 'r') as f:
+            current_idx = sum(1 for _ in f)
+    else:
+        current_idx = 0
+
+    name = f"prompt_{current_idx}"
+    name2prompt_dict = defaultdict(list)
+    name2prompt_dict[name].append(prompt)
+    with open(jsonl_path, 'a') as f:
+        f.write(json.dumps(name2prompt_dict) + '\n')
+    decoder(generated_sample, output_path=str(save_dir / f"{name}_{uid}.mid"))
+
+def unconditioned_worker(process_idx, gpu_id, args, num_samples):
+    """Worker process for unconditioned generation"""
+    torch.cuda.set_device(gpu_id)
+    device = torch.device(f'cuda:{gpu_id}')
+
+    # Load resources with proper device
+    config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device)
+
+    # Create output directory with process index
+    base_path = Path('wandb') / args.wandb_exp_dir / \
+        f"uncond_{args.sampling_method}_t{args.threshold}_temp{args.temperature}"
+    base_path.mkdir(parents=True, exist_ok=True)
+
+    # Generate assigned number of samples
+    batch_dir = base_path
+    generate_samples_unconditioned(
+        config,
+        vocab,
+        model,
+        device,
+        batch_dir,
+        num_samples,
+        config.data_params.first_pred_feature,
+        args.sampling_method,
+        args.threshold,
+        args.temperature,
+        generation_length=args.generate_length,
+        uid=f"{process_idx}"
+    )
+
+def text_conditioned_worker(process_idx, gpu_id, args, num_samples, data_slice):
+    """Worker process for text-conditioned generation"""
+    torch.cuda.set_device(gpu_id)
+    device = torch.device(f'cuda:{gpu_id}')
+
+    # Load resources with proper device
+    config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device)
+
+    # Create output directory with
process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"text_condi_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + # Generate assigned number of samples + batch_dir = base_path + for idx, tune_name in enumerate(data_slice): + print(f"Process {process_idx} generating samples for tune: {tune_name}") + generate_samples_with_text_prompt( + config, + vocab, + model, + device, + batch_dir, + prompt=tune_name, + first_pred_feature=config.data_params.first_pred_feature, + sampling_method=args.sampling_method, + threshold=args.threshold, + temperature=args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}_{idx}" + ) +def main(): + # use spawn method for multiprocessing + set_start_method('spawn', force=True) + args = get_argument_parser().parse_args() + gpu_ids = list(map(int, args.gpu_ids.split(','))) + + # Validate GPU availability + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if len(gpu_ids) == 0: + raise ValueError("At least one GPU must be specified") + + # Validate process count + if args.num_processes < 1: + raise ValueError("Number of processes must be at least 1") + if len(gpu_ids) < args.num_processes: + print(f"Warning: More processes ({args.num_processes}) than GPUs ({len(gpu_ids)}), some GPUs will be shared") + + # Prepare data slices for processes + processes = [] + try: + if args.generation_type == 'conditioned': + # Prepare selected tunes + wandb_dir = Path('wandb') / args.wandb_exp_dir + if not wandb_dir.exists(): + raise FileNotFoundError(f"Experiment {args.wandb_exp_dir} not found") + + # Load test set to get selected tunes (dummy load to get dataset info) + dummy_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + _, test_set, _ = prepare_model_and_dataset_from_config( + wandb_dir / "files" / "config.yaml", + wandb_dir / "files" / "metadata.json", + wandb_dir / "files" / "vocab.json" + ) + + if args.choose_selected_tunes and test_set.dataset == 'SOD': + selected_tunes = ['Requiem_orch', 'magnificat_bwv-243_8_orch', + "Clarinet Concert in A Major: 2nd Movement, Adagio_orch"] + else: + selected_tunes = [name for _, name in test_set.data_list][:args.num_samples] + + # Split selected data across processes + selected_data = [d for d in test_set.data_list if d[1] in selected_tunes] + chunk_size = (len(selected_data) + args.num_processes - 1) // args.num_processes + + for i in range(args.num_processes): + start_idx = i * chunk_size + end_idx = min((i+1)*chunk_size, len(selected_data)) + data_slice = selected_data[start_idx:end_idx] + + if not data_slice: + continue + + gpu_id = gpu_ids[i % len(gpu_ids)] + p = Process( + target=conditioned_worker, + args=(i, gpu_id, args, data_slice) + ) + processes.append(p) + p.start() + + elif args.generation_type == 'unconditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + p = Process( + target=unconditioned_worker, + args=(i, gpu_id, args, samples) + ) + processes.append(p) + p.start() + elif args.generation_type == 'text-conditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + # Load prompts from file + prompt_name_list = [] + with open(args.prompt_file, 'r') as f: + for 
line in f: + if not line.strip(): + continue + prompt_data = json.loads(line.strip()) + prompt_text = prompt_data['caption'] + if prompt_data['test_set'] is True: + prompt_name_list.append(prompt_text) + print("length of prompt_name_list:", len(prompt_name_list)) + if len(prompt_name_list) >= args.num_samples: + print(f"Reached the limit of {args.num_samples} prompts.") + break + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + # Split prompt names across processes + start_idx = i * (len(prompt_name_list) // args.num_processes) + end_idx = (i + 1) * (len(prompt_name_list) // args.num_processes) + data_slice = prompt_name_list[start_idx:end_idx] + + p = Process( + target=text_conditioned_worker, + args=(i, gpu_id, args, samples, data_slice) + ) + processes.append(p) + p.start() + # Wait for all processes to complete + for p in processes: + p.join() + + except Exception as e: + print(f"Error in main process: {str(e)}") + for p in processes: + p.terminate() + raise + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SongEval/matrics.py b/SongEval/matrics.py new file mode 100644 index 0000000..80de842 --- /dev/null +++ b/SongEval/matrics.py @@ -0,0 +1,68 @@ +import argparse +import os +import shutil +import tempfile +import numpy as np +import torch +from audioldm_eval import EvaluationHelper, EvaluationHelperParallel +import torch.multiprocessing as mp + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--generation_path", type=str, required=True, help="Path to generated audio files") + parser.add_argument("--target_path", type=str, required=True, help="Path to reference audio files") + parser.add_argument("--force_paired", action="store_true", help="Force pairing by randomly selecting reference files") + parser.add_argument("--gpu_mode", choices=["single", "multi"], default="single", help="Evaluation mode") + parser.add_argument("--num_gpus", type=int, default=2, help="Number of GPUs for multi-GPU mode") + args = parser.parse_args() + + # Handle forced pairing + target_eval_path = args.target_path + temp_dir = None + if args.force_paired: + print(f"Using forced pairing with reference files from {args.target_path}") + temp_dir = tempfile.mkdtemp() + target_eval_path = temp_dir + + # Collect generated filenames + gen_files = [] + for root, _, files in os.walk(args.generation_path): + for file in files: + if file.endswith(".wav"): + gen_files.append(file) + print(f"Found {len(gen_files)} generated files in {args.generation_path}") + # Collect all reference files + ref_files = [] + for root, _, files in os.walk(args.target_path): + for file in files: + if file.endswith(".wav"): + ref_files.append(os.path.join(root, file)) + + # Select random references matching the count + selected_refs = np.random.choice(ref_files, len(gen_files), replace=False) + print(f"Selected {len(selected_refs)} reference files for evaluation.") + # Copy selected references to temp dir with generated filenames + for gen_file, ref_path in zip(gen_files, selected_refs): + shutil.copy(ref_path, os.path.join(temp_dir, gen_file)) + + + device = torch.device(f"cuda:{0}") if args.gpu_mode == "single" else None + + try: + if args.gpu_mode == "single": + print("Running single GPU evaluation...") + evaluator = EvaluationHelper(16000, device) + metrics = evaluator.main(args.generation_path, target_eval_path) + else: + print(f"Running multi-GPU evaluation on 
{args.num_gpus} GPUs...") + evaluator = EvaluationHelperParallel(16000, args.num_gpus) + metrics = evaluator.main(args.generation_path, target_eval_path) + print("Evaluation completed.") + + finally: + # Clean up temporary directory + if temp_dir and os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/SongEval/model.py b/SongEval/model.py new file mode 100644 index 0000000..7dc8713 --- /dev/null +++ b/SongEval/model.py @@ -0,0 +1,66 @@ +from einops import rearrange +import numpy as np +import torch +import torch.nn as nn + + +class Generator(nn.Module): + + def __init__(self, + in_features, + ffd_hidden_size, + num_classes, + attn_layer_num, + + ): + super(Generator, self).__init__() + + self.attn = nn.ModuleList( + [ + nn.MultiheadAttention( + embed_dim=in_features, + num_heads=8, + dropout=0.2, + batch_first=True, + ) + for _ in range(attn_layer_num) + ] + ) + + self.ffd = nn.Sequential( + nn.Linear(in_features, ffd_hidden_size), + nn.ReLU(), + nn.Linear(ffd_hidden_size, in_features) + ) + + self.dropout = nn.Dropout(0.2) + + self.fc = nn.Linear(in_features * 2, num_classes) + + self.proj = nn.Tanh() + + + def forward(self, ssl_feature, judge_id=None): + ''' + ssl_feature: [B, T, D] + output: [B, num_classes] + ''' + + B, T, D = ssl_feature.shape + + ssl_feature = self.ffd(ssl_feature) + + tmp_ssl_feature = ssl_feature + + for attn in self.attn: + tmp_ssl_feature, _ = attn(tmp_ssl_feature, tmp_ssl_feature, tmp_ssl_feature) + + ssl_feature = self.dropout(torch.concat([torch.mean(tmp_ssl_feature, dim=1), torch.max(ssl_feature, dim=1)[0]], dim=1)) # B, 2D + + x = self.fc(ssl_feature) # B, num_classes + + x = self.proj(x) * 2.0 + 3 + + return x + + diff --git a/SongEval/requirements.txt b/SongEval/requirements.txt new file mode 100644 index 0000000..b7e5e7d --- /dev/null +++ b/SongEval/requirements.txt @@ -0,0 +1,4 @@ +librosa==0.11.0 +torch==2.7.0 +muq==0.1.0 +hydra-core==1.3.2 \ No newline at end of file diff --git a/SongEval/result.json b/SongEval/result.json new file mode 100644 index 0000000..6b6b1a9 --- /dev/null +++ b/SongEval/result.json @@ -0,0 +1,3397 @@ +{ + "3_95": { + "Coherence": 2.0777, + "Musicality": 2.0003, + "Memorability": 1.9263, + "Clarity": 2.0167, + "Naturalness": 1.9908 + }, + "1_67": { + "Coherence": 2.2713, + "Musicality": 2.1316, + "Memorability": 2.0851, + "Clarity": 2.011, + "Naturalness": 2.0983 + }, + "1_71": { + "Coherence": 1.9365, + "Musicality": 1.7492, + "Memorability": 1.8149, + "Clarity": 1.7948, + "Naturalness": 1.8447 + }, + "3_11": { + "Coherence": 2.2496, + "Musicality": 1.8815, + "Memorability": 2.0675, + "Clarity": 2.0203, + "Naturalness": 1.9953 + }, + "0_25": { + "Coherence": 2.4701, + "Musicality": 2.4306, + "Memorability": 2.3398, + "Clarity": 2.3215, + "Naturalness": 2.2693 + }, + "3_28": { + "Coherence": 2.1966, + "Musicality": 2.0181, + "Memorability": 2.0707, + "Clarity": 2.0365, + "Naturalness": 2.0673 + }, + "0_13": { + "Coherence": 1.8047, + "Musicality": 1.6372, + "Memorability": 1.6814, + "Clarity": 1.6422, + "Naturalness": 1.6898 + }, + "4_56": { + "Coherence": 2.0725, + "Musicality": 2.0063, + "Memorability": 2.0271, + "Clarity": 2.0444, + "Naturalness": 2.0402 + }, + "1_77": { + "Coherence": 2.0153, + "Musicality": 2.0462, + "Memorability": 1.9804, + "Clarity": 1.9786, + "Naturalness": 2.024 + }, + "2_75": { + "Coherence": 2.6489, + "Musicality": 2.4531, + "Memorability": 2.4532, + "Clarity": 2.4901, + "Naturalness": 2.3846 + }, + "3_16": { + 
"Coherence": 2.1249, + "Musicality": 1.9554, + "Memorability": 1.9534, + "Clarity": 1.935, + "Naturalness": 1.9855 + }, + "1_41": { + "Coherence": 2.4529, + "Musicality": 2.5158, + "Memorability": 2.3429, + "Clarity": 2.4205, + "Naturalness": 2.377 + }, + "3_76": { + "Coherence": 2.6071, + "Musicality": 2.5502, + "Memorability": 2.5093, + "Clarity": 2.3261, + "Naturalness": 2.3095 + }, + "3_25": { + "Coherence": 2.5939, + "Musicality": 2.7286, + "Memorability": 2.6561, + "Clarity": 2.4598, + "Naturalness": 2.4987 + }, + "0_63": { + "Coherence": 2.9322, + "Musicality": 2.7272, + "Memorability": 2.7328, + "Clarity": 2.7141, + "Naturalness": 2.8544 + }, + "3_97": { + "Coherence": 2.2356, + "Musicality": 2.1371, + "Memorability": 2.0707, + "Clarity": 2.1362, + "Naturalness": 2.2475 + }, + "3_67": { + "Coherence": 2.0389, + "Musicality": 1.9455, + "Memorability": 2.0118, + "Clarity": 1.82, + "Naturalness": 1.9762 + }, + "4_38": { + "Coherence": 2.4616, + "Musicality": 2.3333, + "Memorability": 2.173, + "Clarity": 2.4206, + "Naturalness": 2.2885 + }, + "1_5": { + "Coherence": 1.757, + "Musicality": 1.7005, + "Memorability": 1.8469, + "Clarity": 1.7476, + "Naturalness": 1.8144 + }, + "0_98": { + "Coherence": 2.539, + "Musicality": 2.4862, + "Memorability": 2.5355, + "Clarity": 2.4826, + "Naturalness": 2.5772 + }, + "4_6": { + "Coherence": 2.5418, + "Musicality": 2.3204, + "Memorability": 2.4124, + "Clarity": 2.2771, + "Naturalness": 2.3653 + }, + "1_75": { + "Coherence": 2.0407, + "Musicality": 2.0109, + "Memorability": 1.9676, + "Clarity": 1.917, + "Naturalness": 1.8748 + }, + "3_89": { + "Coherence": 2.3063, + "Musicality": 2.1369, + "Memorability": 2.1295, + "Clarity": 2.226, + "Naturalness": 2.2245 + }, + "2_54": { + "Coherence": 1.98, + "Musicality": 1.8938, + "Memorability": 1.8962, + "Clarity": 1.9052, + "Naturalness": 1.8847 + }, + "0_18": { + "Coherence": 2.068, + "Musicality": 1.9906, + "Memorability": 1.9896, + "Clarity": 1.9079, + "Naturalness": 1.9479 + }, + "2_88": { + "Coherence": 2.4045, + "Musicality": 2.2651, + "Memorability": 2.2801, + "Clarity": 2.2443, + "Naturalness": 2.1988 + }, + "1_23": { + "Coherence": 2.2784, + "Musicality": 2.1312, + "Memorability": 2.1006, + "Clarity": 2.2199, + "Naturalness": 2.2498 + }, + "4_92": { + "Coherence": 2.3807, + "Musicality": 2.1529, + "Memorability": 2.1955, + "Clarity": 2.0735, + "Naturalness": 2.1025 + }, + "0_49": { + "Coherence": 1.8054, + "Musicality": 1.6688, + "Memorability": 1.7004, + "Clarity": 1.6797, + "Naturalness": 1.7418 + }, + "2_90": { + "Coherence": 1.8965, + "Musicality": 1.7718, + "Memorability": 1.7087, + "Clarity": 1.743, + "Naturalness": 1.7686 + }, + "3_1": { + "Coherence": 2.5091, + "Musicality": 2.3787, + "Memorability": 2.309, + "Clarity": 2.3365, + "Naturalness": 2.5147 + }, + "2_50": { + "Coherence": 2.1869, + "Musicality": 2.3706, + "Memorability": 2.1068, + "Clarity": 2.0057, + "Naturalness": 2.1438 + }, + "3_70": { + "Coherence": 2.3004, + "Musicality": 2.3632, + "Memorability": 2.1409, + "Clarity": 2.125, + "Naturalness": 2.1482 + }, + "0_37": { + "Coherence": 2.5766, + "Musicality": 2.5612, + "Memorability": 2.3669, + "Clarity": 2.3994, + "Naturalness": 2.3825 + }, + "1_12": { + "Coherence": 1.1409, + "Musicality": 1.1304, + "Memorability": 1.1616, + "Clarity": 1.122, + "Naturalness": 1.1375 + }, + "0_86": { + "Coherence": 2.5979, + "Musicality": 2.5692, + "Memorability": 2.3593, + "Clarity": 2.4961, + "Naturalness": 2.5166 + }, + "4_99": { + "Coherence": 2.4895, + "Musicality": 2.391, + "Memorability": 
2.3179, + "Clarity": 2.4036, + "Naturalness": 2.3048 + }, + "2_77": { + "Coherence": 2.1994, + "Musicality": 2.0788, + "Memorability": 2.0804, + "Clarity": 1.8985, + "Naturalness": 1.9426 + }, + "2_3": { + "Coherence": 1.8327, + "Musicality": 1.878, + "Memorability": 1.727, + "Clarity": 1.8213, + "Naturalness": 1.7748 + }, + "0_62": { + "Coherence": 1.7433, + "Musicality": 1.6047, + "Memorability": 1.7248, + "Clarity": 1.7229, + "Naturalness": 1.7643 + }, + "2_42": { + "Coherence": 2.5783, + "Musicality": 2.4933, + "Memorability": 2.352, + "Clarity": 2.3632, + "Naturalness": 2.5246 + }, + "0_60": { + "Coherence": 2.4633, + "Musicality": 2.2678, + "Memorability": 2.3175, + "Clarity": 2.3106, + "Naturalness": 2.3029 + }, + "3_99": { + "Coherence": 2.3416, + "Musicality": 2.2106, + "Memorability": 1.9726, + "Clarity": 2.0669, + "Naturalness": 2.3347 + }, + "3_80": { + "Coherence": 2.2788, + "Musicality": 2.3309, + "Memorability": 2.1834, + "Clarity": 2.2162, + "Naturalness": 2.2746 + }, + "2_58": { + "Coherence": 2.2654, + "Musicality": 2.1978, + "Memorability": 1.9416, + "Clarity": 2.1375, + "Naturalness": 2.0765 + }, + "0_46": { + "Coherence": 2.014, + "Musicality": 2.0622, + "Memorability": 2.0347, + "Clarity": 1.947, + "Naturalness": 2.0524 + }, + "1_38": { + "Coherence": 2.3565, + "Musicality": 2.2671, + "Memorability": 2.2796, + "Clarity": 2.1579, + "Naturalness": 2.1791 + }, + "1_98": { + "Coherence": 2.2469, + "Musicality": 2.1249, + "Memorability": 1.9923, + "Clarity": 2.0832, + "Naturalness": 2.1617 + }, + "0_5": { + "Coherence": 1.9986, + "Musicality": 1.8451, + "Memorability": 1.8903, + "Clarity": 1.8648, + "Naturalness": 2.0043 + }, + "0_61": { + "Coherence": 2.0186, + "Musicality": 1.9662, + "Memorability": 1.7885, + "Clarity": 1.8509, + "Naturalness": 1.8734 + }, + "4_68": { + "Coherence": 1.3889, + "Musicality": 1.4006, + "Memorability": 1.4449, + "Clarity": 1.3977, + "Naturalness": 1.3989 + }, + "3_52": { + "Coherence": 2.2957, + "Musicality": 2.302, + "Memorability": 2.3302, + "Clarity": 2.1986, + "Naturalness": 2.3071 + }, + "0_30": { + "Coherence": 2.4437, + "Musicality": 2.3724, + "Memorability": 2.2421, + "Clarity": 2.2268, + "Naturalness": 2.3019 + }, + "4_65": { + "Coherence": 2.0702, + "Musicality": 1.8917, + "Memorability": 1.9231, + "Clarity": 1.8066, + "Naturalness": 1.8056 + }, + "4_78": { + "Coherence": 2.5228, + "Musicality": 2.5301, + "Memorability": 2.2555, + "Clarity": 2.3036, + "Naturalness": 2.3016 + }, + "3_41": { + "Coherence": 2.3243, + "Musicality": 2.0625, + "Memorability": 2.2123, + "Clarity": 2.013, + "Naturalness": 2.1384 + }, + "3_4": { + "Coherence": 2.1711, + "Musicality": 2.1947, + "Memorability": 2.0476, + "Clarity": 2.0618, + "Naturalness": 2.1072 + }, + "0_34": { + "Coherence": 2.4183, + "Musicality": 2.2883, + "Memorability": 2.0825, + "Clarity": 2.1587, + "Naturalness": 2.3078 + }, + "2_67": { + "Coherence": 2.3483, + "Musicality": 2.1847, + "Memorability": 2.09, + "Clarity": 2.1366, + "Naturalness": 2.1259 + }, + "3_96": { + "Coherence": 2.1818, + "Musicality": 2.2395, + "Memorability": 2.2308, + "Clarity": 2.0452, + "Naturalness": 2.1293 + }, + "0_66": { + "Coherence": 2.0316, + "Musicality": 1.8951, + "Memorability": 2.0299, + "Clarity": 1.8193, + "Naturalness": 1.9089 + }, + "4_53": { + "Coherence": 2.3561, + "Musicality": 2.4071, + "Memorability": 2.2317, + "Clarity": 2.336, + "Naturalness": 2.3277 + }, + "2_23": { + "Coherence": 2.3174, + "Musicality": 2.3013, + "Memorability": 2.3749, + "Clarity": 2.3659, + "Naturalness": 2.2551 + }, 
+ "2_28": { + "Coherence": 2.1358, + "Musicality": 2.0423, + "Memorability": 1.9649, + "Clarity": 1.9885, + "Naturalness": 2.0853 + }, + "2_29": { + "Coherence": 2.0729, + "Musicality": 2.0636, + "Memorability": 2.1252, + "Clarity": 1.9903, + "Naturalness": 1.9401 + }, + "4_21": { + "Coherence": 2.3339, + "Musicality": 2.0501, + "Memorability": 1.9084, + "Clarity": 1.9564, + "Naturalness": 1.9006 + }, + "3_46": { + "Coherence": 2.0809, + "Musicality": 2.0113, + "Memorability": 1.9666, + "Clarity": 1.8907, + "Naturalness": 2.0141 + }, + "0_77": { + "Coherence": 2.0897, + "Musicality": 2.098, + "Memorability": 2.2421, + "Clarity": 2.1986, + "Naturalness": 2.0956 + }, + "4_86": { + "Coherence": 2.491, + "Musicality": 2.5162, + "Memorability": 2.32, + "Clarity": 2.3499, + "Naturalness": 2.4308 + }, + "2_4": { + "Coherence": 1.8087, + "Musicality": 1.7177, + "Memorability": 1.5451, + "Clarity": 1.7168, + "Naturalness": 1.6058 + }, + "0_88": { + "Coherence": 1.779, + "Musicality": 1.8651, + "Memorability": 1.7381, + "Clarity": 1.7405, + "Naturalness": 1.7942 + }, + "3_79": { + "Coherence": 2.4717, + "Musicality": 2.2998, + "Memorability": 2.382, + "Clarity": 2.2195, + "Naturalness": 2.1466 + }, + "3_77": { + "Coherence": 2.1888, + "Musicality": 2.0327, + "Memorability": 2.0769, + "Clarity": 1.9536, + "Naturalness": 2.2245 + }, + "1_74": { + "Coherence": 2.3644, + "Musicality": 2.3894, + "Memorability": 2.2264, + "Clarity": 2.2723, + "Naturalness": 2.3437 + }, + "0_75": { + "Coherence": 2.0146, + "Musicality": 1.9708, + "Memorability": 1.9617, + "Clarity": 1.8868, + "Naturalness": 1.9818 + }, + "1_28": { + "Coherence": 2.281, + "Musicality": 2.2385, + "Memorability": 2.1841, + "Clarity": 2.1268, + "Naturalness": 2.1962 + }, + "2_76": { + "Coherence": 1.9583, + "Musicality": 1.6395, + "Memorability": 1.7161, + "Clarity": 1.7739, + "Naturalness": 1.6589 + }, + "2_7": { + "Coherence": 1.7494, + "Musicality": 1.7288, + "Memorability": 1.6475, + "Clarity": 1.696, + "Naturalness": 1.6172 + }, + "2_63": { + "Coherence": 2.142, + "Musicality": 2.1304, + "Memorability": 1.8713, + "Clarity": 1.9539, + "Naturalness": 1.9687 + }, + "1_55": { + "Coherence": 2.454, + "Musicality": 2.5627, + "Memorability": 2.3505, + "Clarity": 2.2649, + "Naturalness": 2.3556 + }, + "1_57": { + "Coherence": 2.1894, + "Musicality": 1.944, + "Memorability": 2.0374, + "Clarity": 1.8814, + "Naturalness": 1.9284 + }, + "4_94": { + "Coherence": 1.839, + "Musicality": 1.7626, + "Memorability": 1.6898, + "Clarity": 1.6964, + "Naturalness": 1.734 + }, + "3_24": { + "Coherence": 2.2606, + "Musicality": 2.1091, + "Memorability": 2.0483, + "Clarity": 2.1631, + "Naturalness": 2.1043 + }, + "2_15": { + "Coherence": 2.2367, + "Musicality": 2.2431, + "Memorability": 2.1904, + "Clarity": 2.1406, + "Naturalness": 2.1539 + }, + "0_8": { + "Coherence": 2.4336, + "Musicality": 2.4236, + "Memorability": 2.5433, + "Clarity": 2.5246, + "Naturalness": 2.4657 + }, + "3_14": { + "Coherence": 1.9237, + "Musicality": 1.856, + "Memorability": 1.8864, + "Clarity": 1.7281, + "Naturalness": 1.8647 + }, + "1_88": { + "Coherence": 2.1056, + "Musicality": 2.0166, + "Memorability": 2.0529, + "Clarity": 1.9988, + "Naturalness": 2.0217 + }, + "1_8": { + "Coherence": 2.2008, + "Musicality": 1.8821, + "Memorability": 2.0604, + "Clarity": 2.0161, + "Naturalness": 2.0145 + }, + "1_0": { + "Coherence": 2.3727, + "Musicality": 2.4407, + "Memorability": 2.1786, + "Clarity": 2.2065, + "Naturalness": 2.172 + }, + "3_92": { + "Coherence": 2.3062, + "Musicality": 2.3895, + 
"Memorability": 2.282, + "Clarity": 2.1578, + "Naturalness": 2.1791 + }, + "0_73": { + "Coherence": 2.1344, + "Musicality": 2.0625, + "Memorability": 1.8351, + "Clarity": 1.9275, + "Naturalness": 1.8729 + }, + "4_81": { + "Coherence": 2.0691, + "Musicality": 1.9451, + "Memorability": 2.1633, + "Clarity": 2.1333, + "Naturalness": 2.0668 + }, + "2_92": { + "Coherence": 2.4402, + "Musicality": 2.29, + "Memorability": 2.3331, + "Clarity": 2.1726, + "Naturalness": 2.1892 + }, + "1_66": { + "Coherence": 2.7032, + "Musicality": 2.6471, + "Memorability": 2.4789, + "Clarity": 2.4478, + "Naturalness": 2.5586 + }, + "3_44": { + "Coherence": 2.1894, + "Musicality": 2.0205, + "Memorability": 2.0975, + "Clarity": 1.9897, + "Naturalness": 2.0595 + }, + "0_4": { + "Coherence": 2.6016, + "Musicality": 2.4601, + "Memorability": 2.3464, + "Clarity": 2.3192, + "Naturalness": 2.2662 + }, + "2_93": { + "Coherence": 1.9954, + "Musicality": 1.9995, + "Memorability": 1.9576, + "Clarity": 1.9181, + "Naturalness": 2.053 + }, + "0_64": { + "Coherence": 2.4691, + "Musicality": 2.6417, + "Memorability": 2.2994, + "Clarity": 2.2818, + "Naturalness": 2.4011 + }, + "3_40": { + "Coherence": 2.6494, + "Musicality": 2.5188, + "Memorability": 2.5081, + "Clarity": 2.5678, + "Naturalness": 2.4542 + }, + "4_29": { + "Coherence": 1.7034, + "Musicality": 1.7253, + "Memorability": 1.5533, + "Clarity": 1.588, + "Naturalness": 1.6467 + }, + "3_74": { + "Coherence": 2.3365, + "Musicality": 2.3321, + "Memorability": 2.1928, + "Clarity": 2.2858, + "Naturalness": 2.1803 + }, + "2_71": { + "Coherence": 1.7911, + "Musicality": 1.7773, + "Memorability": 1.785, + "Clarity": 1.6606, + "Naturalness": 1.7868 + }, + "2_45": { + "Coherence": 2.0612, + "Musicality": 2.2141, + "Memorability": 1.912, + "Clarity": 2.0512, + "Naturalness": 1.991 + }, + "1_58": { + "Coherence": 2.5347, + "Musicality": 2.4564, + "Memorability": 2.3709, + "Clarity": 2.3363, + "Naturalness": 2.4027 + }, + "2_24": { + "Coherence": 2.1706, + "Musicality": 2.073, + "Memorability": 2.0641, + "Clarity": 1.9986, + "Naturalness": 2.1331 + }, + "2_44": { + "Coherence": 2.4707, + "Musicality": 2.2782, + "Memorability": 2.2546, + "Clarity": 2.2037, + "Naturalness": 2.2995 + }, + "3_73": { + "Coherence": 2.2706, + "Musicality": 2.2547, + "Memorability": 2.0179, + "Clarity": 2.2202, + "Naturalness": 2.1244 + }, + "1_72": { + "Coherence": 1.5911, + "Musicality": 1.5279, + "Memorability": 1.6521, + "Clarity": 1.5473, + "Naturalness": 1.5907 + }, + "0_94": { + "Coherence": 1.8602, + "Musicality": 1.8602, + "Memorability": 1.7409, + "Clarity": 1.8094, + "Naturalness": 1.854 + }, + "3_0": { + "Coherence": 1.6971, + "Musicality": 1.5703, + "Memorability": 1.5548, + "Clarity": 1.5854, + "Naturalness": 1.6097 + }, + "0_95": { + "Coherence": 1.8323, + "Musicality": 1.8864, + "Memorability": 1.8162, + "Clarity": 1.8239, + "Naturalness": 1.8678 + }, + "1_20": { + "Coherence": 1.8756, + "Musicality": 1.9785, + "Memorability": 1.947, + "Clarity": 1.8313, + "Naturalness": 1.9069 + }, + "1_80": { + "Coherence": 1.9406, + "Musicality": 1.9605, + "Memorability": 1.928, + "Clarity": 1.9513, + "Naturalness": 2.054 + }, + "4_62": { + "Coherence": 1.5723, + "Musicality": 1.4671, + "Memorability": 1.4701, + "Clarity": 1.4424, + "Naturalness": 1.5156 + }, + "2_78": { + "Coherence": 2.4861, + "Musicality": 2.5491, + "Memorability": 2.3015, + "Clarity": 2.3086, + "Naturalness": 2.32 + }, + "0_65": { + "Coherence": 2.232, + "Musicality": 2.1086, + "Memorability": 2.2506, + "Clarity": 2.0765, + "Naturalness": 
2.0796 + }, + "4_66": { + "Coherence": 2.3562, + "Musicality": 2.3646, + "Memorability": 2.3862, + "Clarity": 2.2813, + "Naturalness": 2.2441 + }, + "4_40": { + "Coherence": 2.1639, + "Musicality": 2.0781, + "Memorability": 2.1157, + "Clarity": 2.1316, + "Naturalness": 2.1229 + }, + "4_93": { + "Coherence": 2.2189, + "Musicality": 2.2275, + "Memorability": 2.0691, + "Clarity": 2.1112, + "Naturalness": 2.0928 + }, + "3_88": { + "Coherence": 2.3956, + "Musicality": 2.3006, + "Memorability": 2.0355, + "Clarity": 2.1448, + "Naturalness": 2.1649 + }, + "0_15": { + "Coherence": 2.2185, + "Musicality": 2.2944, + "Memorability": 2.0519, + "Clarity": 2.1389, + "Naturalness": 2.0885 + }, + "1_96": { + "Coherence": 2.3422, + "Musicality": 2.3445, + "Memorability": 2.2867, + "Clarity": 2.1522, + "Naturalness": 2.1611 + }, + "0_74": { + "Coherence": 2.0854, + "Musicality": 2.0287, + "Memorability": 1.9169, + "Clarity": 1.9461, + "Naturalness": 2.1022 + }, + "0_54": { + "Coherence": 1.568, + "Musicality": 1.5066, + "Memorability": 1.5506, + "Clarity": 1.4892, + "Naturalness": 1.5473 + }, + "2_97": { + "Coherence": 2.0816, + "Musicality": 2.1353, + "Memorability": 1.9688, + "Clarity": 1.9818, + "Naturalness": 1.9441 + }, + "2_74": { + "Coherence": 1.9713, + "Musicality": 2.1465, + "Memorability": 1.9864, + "Clarity": 2.0631, + "Naturalness": 2.0106 + }, + "0_27": { + "Coherence": 2.8013, + "Musicality": 2.7117, + "Memorability": 2.4925, + "Clarity": 2.6482, + "Naturalness": 2.6713 + }, + "0_33": { + "Coherence": 2.3914, + "Musicality": 2.432, + "Memorability": 2.2417, + "Clarity": 2.4649, + "Naturalness": 2.4498 + }, + "1_2": { + "Coherence": 2.2006, + "Musicality": 2.1381, + "Memorability": 2.0668, + "Clarity": 2.0994, + "Naturalness": 2.1751 + }, + "4_5": { + "Coherence": 2.2028, + "Musicality": 2.2548, + "Memorability": 2.094, + "Clarity": 2.104, + "Naturalness": 2.0769 + }, + "1_82": { + "Coherence": 2.1139, + "Musicality": 2.0486, + "Memorability": 1.9668, + "Clarity": 2.0287, + "Naturalness": 1.983 + }, + "0_38": { + "Coherence": 2.2971, + "Musicality": 2.4286, + "Memorability": 2.334, + "Clarity": 2.2721, + "Naturalness": 2.3503 + }, + "4_59": { + "Coherence": 2.5923, + "Musicality": 2.5951, + "Memorability": 2.6329, + "Clarity": 2.5983, + "Naturalness": 2.5336 + }, + "3_71": { + "Coherence": 2.3849, + "Musicality": 2.0994, + "Memorability": 2.1678, + "Clarity": 2.1002, + "Naturalness": 2.1303 + }, + "2_35": { + "Coherence": 2.2882, + "Musicality": 2.1923, + "Memorability": 2.1158, + "Clarity": 2.0651, + "Naturalness": 2.2521 + }, + "3_86": { + "Coherence": 2.5946, + "Musicality": 2.5866, + "Memorability": 2.3881, + "Clarity": 2.4527, + "Naturalness": 2.3642 + }, + "2_14": { + "Coherence": 2.5263, + "Musicality": 2.3986, + "Memorability": 2.2717, + "Clarity": 2.2873, + "Naturalness": 2.3597 + }, + "3_42": { + "Coherence": 1.9662, + "Musicality": 1.9749, + "Memorability": 1.9184, + "Clarity": 1.8202, + "Naturalness": 1.9593 + }, + "3_48": { + "Coherence": 1.8617, + "Musicality": 1.8093, + "Memorability": 1.8166, + "Clarity": 1.8295, + "Naturalness": 1.8602 + }, + "3_31": { + "Coherence": 2.4623, + "Musicality": 2.4417, + "Memorability": 2.1803, + "Clarity": 2.2882, + "Naturalness": 2.2492 + }, + "1_32": { + "Coherence": 2.6004, + "Musicality": 2.5097, + "Memorability": 2.3734, + "Clarity": 2.3893, + "Naturalness": 2.4 + }, + "4_7": { + "Coherence": 2.297, + "Musicality": 2.3344, + "Memorability": 2.3356, + "Clarity": 2.1336, + "Naturalness": 2.2263 + }, + "3_93": { + "Coherence": 2.2957, + 
"Musicality": 2.1905, + "Memorability": 2.1554, + "Clarity": 2.0281, + "Naturalness": 2.207 + }, + "4_42": { + "Coherence": 2.2719, + "Musicality": 2.252, + "Memorability": 2.0813, + "Clarity": 2.1453, + "Naturalness": 2.1588 + }, + "0_91": { + "Coherence": 2.141, + "Musicality": 2.0821, + "Memorability": 2.0982, + "Clarity": 1.983, + "Naturalness": 1.9613 + }, + "3_27": { + "Coherence": 1.8279, + "Musicality": 1.7477, + "Memorability": 1.7982, + "Clarity": 1.6296, + "Naturalness": 1.7693 + }, + "3_54": { + "Coherence": 2.3305, + "Musicality": 2.3382, + "Memorability": 2.4071, + "Clarity": 2.2438, + "Naturalness": 2.2249 + }, + "1_24": { + "Coherence": 2.185, + "Musicality": 2.36, + "Memorability": 2.2286, + "Clarity": 2.1736, + "Naturalness": 2.1503 + }, + "0_14": { + "Coherence": 2.1015, + "Musicality": 2.1473, + "Memorability": 2.031, + "Clarity": 2.0198, + "Naturalness": 2.0559 + }, + "3_2": { + "Coherence": 2.0178, + "Musicality": 1.9007, + "Memorability": 2.0232, + "Clarity": 1.9892, + "Naturalness": 2.0074 + }, + "3_10": { + "Coherence": 2.3047, + "Musicality": 2.365, + "Memorability": 2.3817, + "Clarity": 2.208, + "Naturalness": 2.2809 + }, + "3_91": { + "Coherence": 2.2553, + "Musicality": 2.2049, + "Memorability": 2.2052, + "Clarity": 2.0175, + "Naturalness": 2.1619 + }, + "0_2": { + "Coherence": 2.1873, + "Musicality": 2.1483, + "Memorability": 1.9162, + "Clarity": 2.0543, + "Naturalness": 2.1214 + }, + "4_3": { + "Coherence": 2.4972, + "Musicality": 2.3255, + "Memorability": 2.3075, + "Clarity": 2.2674, + "Naturalness": 2.4217 + }, + "0_28": { + "Coherence": 2.4347, + "Musicality": 2.4768, + "Memorability": 2.2624, + "Clarity": 2.2847, + "Naturalness": 2.3786 + }, + "3_5": { + "Coherence": 1.989, + "Musicality": 1.9532, + "Memorability": 1.8144, + "Clarity": 1.8353, + "Naturalness": 1.9482 + }, + "3_84": { + "Coherence": 2.2007, + "Musicality": 2.0882, + "Memorability": 1.9826, + "Clarity": 1.8819, + "Naturalness": 1.9807 + }, + "4_18": { + "Coherence": 2.2287, + "Musicality": 2.1163, + "Memorability": 2.0292, + "Clarity": 2.0164, + "Naturalness": 2.0719 + }, + "3_49": { + "Coherence": 2.2148, + "Musicality": 1.9555, + "Memorability": 1.8934, + "Clarity": 1.9539, + "Naturalness": 2.0419 + }, + "4_70": { + "Coherence": 1.9702, + "Musicality": 2.0574, + "Memorability": 1.9396, + "Clarity": 1.8764, + "Naturalness": 1.8685 + }, + "0_68": { + "Coherence": 1.9775, + "Musicality": 1.9256, + "Memorability": 1.9359, + "Clarity": 1.9169, + "Naturalness": 1.9153 + }, + "2_11": { + "Coherence": 2.1669, + "Musicality": 2.02, + "Memorability": 2.0331, + "Clarity": 2.0185, + "Naturalness": 2.1166 + }, + "4_49": { + "Coherence": 2.595, + "Musicality": 2.3318, + "Memorability": 2.038, + "Clarity": 2.3474, + "Naturalness": 2.2379 + }, + "1_93": { + "Coherence": 2.309, + "Musicality": 2.175, + "Memorability": 2.2298, + "Clarity": 2.0778, + "Naturalness": 2.1573 + }, + "0_89": { + "Coherence": 2.6534, + "Musicality": 2.697, + "Memorability": 2.5988, + "Clarity": 2.448, + "Naturalness": 2.3883 + }, + "3_66": { + "Coherence": 2.1432, + "Musicality": 2.1136, + "Memorability": 2.125, + "Clarity": 2.0783, + "Naturalness": 2.3627 + }, + "0_21": { + "Coherence": 2.5321, + "Musicality": 2.4141, + "Memorability": 2.3183, + "Clarity": 2.261, + "Naturalness": 2.2971 + }, + "0_1": { + "Coherence": 1.5446, + "Musicality": 1.5321, + "Memorability": 1.5399, + "Clarity": 1.471, + "Naturalness": 1.4825 + }, + "2_79": { + "Coherence": 2.5557, + "Musicality": 2.5329, + "Memorability": 2.52, + "Clarity": 2.489, + 
"Naturalness": 2.6073 + }, + "4_22": { + "Coherence": 1.8591, + "Musicality": 1.9253, + "Memorability": 1.8709, + "Clarity": 1.8276, + "Naturalness": 1.8137 + }, + "1_56": { + "Coherence": 2.3548, + "Musicality": 2.3963, + "Memorability": 2.2703, + "Clarity": 2.327, + "Naturalness": 2.2915 + }, + "1_39": { + "Coherence": 1.9871, + "Musicality": 1.689, + "Memorability": 1.92, + "Clarity": 1.7833, + "Naturalness": 1.8184 + }, + "1_86": { + "Coherence": 2.2606, + "Musicality": 2.2865, + "Memorability": 2.2581, + "Clarity": 2.3054, + "Naturalness": 2.2113 + }, + "0_85": { + "Coherence": 1.9186, + "Musicality": 1.9081, + "Memorability": 1.9889, + "Clarity": 1.7794, + "Naturalness": 1.8494 + }, + "3_61": { + "Coherence": 1.9655, + "Musicality": 1.9839, + "Memorability": 1.8955, + "Clarity": 1.9394, + "Naturalness": 1.9518 + }, + "0_92": { + "Coherence": 2.2782, + "Musicality": 2.4049, + "Memorability": 2.2771, + "Clarity": 2.1731, + "Naturalness": 2.2203 + }, + "0_16": { + "Coherence": 1.9612, + "Musicality": 2.0146, + "Memorability": 1.7677, + "Clarity": 1.8591, + "Naturalness": 2.0399 + }, + "3_22": { + "Coherence": 2.1536, + "Musicality": 2.0755, + "Memorability": 1.9017, + "Clarity": 1.9414, + "Naturalness": 2.0436 + }, + "0_79": { + "Coherence": 2.1513, + "Musicality": 2.1437, + "Memorability": 1.9734, + "Clarity": 1.9043, + "Naturalness": 2.0233 + }, + "0_71": { + "Coherence": 2.4485, + "Musicality": 2.3952, + "Memorability": 2.3545, + "Clarity": 2.3959, + "Naturalness": 2.3662 + }, + "3_26": { + "Coherence": 2.0978, + "Musicality": 2.1113, + "Memorability": 2.0101, + "Clarity": 2.0855, + "Naturalness": 2.0965 + }, + "0_47": { + "Coherence": 2.2464, + "Musicality": 2.3119, + "Memorability": 2.4551, + "Clarity": 2.1364, + "Naturalness": 2.1958 + }, + "0_70": { + "Coherence": 2.1887, + "Musicality": 2.0891, + "Memorability": 2.1789, + "Clarity": 1.8599, + "Naturalness": 1.9961 + }, + "2_82": { + "Coherence": 2.0149, + "Musicality": 1.9833, + "Memorability": 1.984, + "Clarity": 1.8742, + "Naturalness": 1.9466 + }, + "1_37": { + "Coherence": 2.2924, + "Musicality": 2.2435, + "Memorability": 2.2556, + "Clarity": 2.1273, + "Naturalness": 2.2784 + }, + "3_32": { + "Coherence": 2.3229, + "Musicality": 2.2298, + "Memorability": 2.1514, + "Clarity": 2.1442, + "Naturalness": 2.1044 + }, + "2_40": { + "Coherence": 2.2995, + "Musicality": 2.3528, + "Memorability": 2.179, + "Clarity": 2.1627, + "Naturalness": 2.294 + }, + "3_15": { + "Coherence": 1.7409, + "Musicality": 1.6818, + "Memorability": 1.4963, + "Clarity": 1.6333, + "Naturalness": 1.6013 + }, + "1_42": { + "Coherence": 2.6129, + "Musicality": 2.5839, + "Memorability": 2.4966, + "Clarity": 2.429, + "Naturalness": 2.3687 + }, + "0_51": { + "Coherence": 2.143, + "Musicality": 2.2888, + "Memorability": 2.1488, + "Clarity": 2.1636, + "Naturalness": 2.1598 + }, + "1_16": { + "Coherence": 2.2393, + "Musicality": 2.247, + "Memorability": 2.2046, + "Clarity": 2.248, + "Naturalness": 2.2468 + }, + "2_49": { + "Coherence": 2.0881, + "Musicality": 2.2131, + "Memorability": 1.8822, + "Clarity": 2.0395, + "Naturalness": 1.9794 + }, + "1_84": { + "Coherence": 2.359, + "Musicality": 2.2389, + "Memorability": 2.2024, + "Clarity": 2.2048, + "Naturalness": 2.2222 + }, + "0_32": { + "Coherence": 2.3477, + "Musicality": 2.2993, + "Memorability": 2.1484, + "Clarity": 2.2419, + "Naturalness": 2.2541 + }, + "2_68": { + "Coherence": 2.3435, + "Musicality": 2.4809, + "Memorability": 2.3231, + "Clarity": 2.1379, + "Naturalness": 2.2375 + }, + "2_31": { + "Coherence": 
2.2846, + "Musicality": 2.1809, + "Memorability": 2.2819, + "Clarity": 2.137, + "Naturalness": 2.2179 + }, + "2_6": { + "Coherence": 2.4279, + "Musicality": 2.2806, + "Memorability": 2.0665, + "Clarity": 2.2548, + "Naturalness": 2.1858 + }, + "4_34": { + "Coherence": 2.433, + "Musicality": 2.3738, + "Memorability": 2.3242, + "Clarity": 2.3203, + "Naturalness": 2.2842 + }, + "3_20": { + "Coherence": 2.4709, + "Musicality": 2.2068, + "Memorability": 2.3481, + "Clarity": 2.1515, + "Naturalness": 2.1738 + }, + "2_26": { + "Coherence": 2.761, + "Musicality": 2.9227, + "Memorability": 2.764, + "Clarity": 2.6298, + "Naturalness": 2.6233 + }, + "1_68": { + "Coherence": 2.5299, + "Musicality": 2.5349, + "Memorability": 2.3781, + "Clarity": 2.246, + "Naturalness": 2.4304 + }, + "2_25": { + "Coherence": 2.1776, + "Musicality": 2.0596, + "Memorability": 2.1202, + "Clarity": 1.9972, + "Naturalness": 2.0727 + }, + "1_69": { + "Coherence": 1.5309, + "Musicality": 1.461, + "Memorability": 1.6637, + "Clarity": 1.4651, + "Naturalness": 1.6969 + }, + "3_69": { + "Coherence": 2.1159, + "Musicality": 1.9865, + "Memorability": 1.8409, + "Clarity": 1.944, + "Naturalness": 1.8649 + }, + "4_19": { + "Coherence": 2.3554, + "Musicality": 2.29, + "Memorability": 2.2956, + "Clarity": 2.237, + "Naturalness": 2.3469 + }, + "4_4": { + "Coherence": 1.9869, + "Musicality": 1.7675, + "Memorability": 1.8982, + "Clarity": 1.7451, + "Naturalness": 1.7462 + }, + "4_57": { + "Coherence": 2.4678, + "Musicality": 2.5522, + "Memorability": 2.335, + "Clarity": 2.4827, + "Naturalness": 2.4377 + }, + "4_75": { + "Coherence": 2.1265, + "Musicality": 2.2157, + "Memorability": 2.1694, + "Clarity": 2.1359, + "Naturalness": 2.2256 + }, + "0_72": { + "Coherence": 2.3192, + "Musicality": 2.1476, + "Memorability": 2.0943, + "Clarity": 2.1586, + "Naturalness": 2.1182 + }, + "2_55": { + "Coherence": 1.8826, + "Musicality": 1.7765, + "Memorability": 1.8529, + "Clarity": 1.8804, + "Naturalness": 1.8853 + }, + "2_18": { + "Coherence": 2.302, + "Musicality": 2.2897, + "Memorability": 2.3549, + "Clarity": 2.112, + "Naturalness": 2.2913 + }, + "3_65": { + "Coherence": 2.3258, + "Musicality": 2.1564, + "Memorability": 2.199, + "Clarity": 2.1968, + "Naturalness": 2.177 + }, + "3_63": { + "Coherence": 2.8712, + "Musicality": 2.8692, + "Memorability": 2.8522, + "Clarity": 2.7607, + "Naturalness": 2.6322 + }, + "4_25": { + "Coherence": 2.312, + "Musicality": 2.2323, + "Memorability": 2.0557, + "Clarity": 2.1357, + "Naturalness": 2.2097 + }, + "4_52": { + "Coherence": 1.9228, + "Musicality": 1.8919, + "Memorability": 1.8349, + "Clarity": 1.8561, + "Naturalness": 1.9062 + }, + "4_1": { + "Coherence": 2.1892, + "Musicality": 2.1107, + "Memorability": 2.15, + "Clarity": 2.1533, + "Naturalness": 2.1818 + }, + "2_94": { + "Coherence": 2.7352, + "Musicality": 2.63, + "Memorability": 2.3537, + "Clarity": 2.4471, + "Naturalness": 2.5075 + }, + "2_32": { + "Coherence": 2.0399, + "Musicality": 2.0722, + "Memorability": 2.1334, + "Clarity": 1.9989, + "Naturalness": 2.1709 + }, + "2_20": { + "Coherence": 2.4868, + "Musicality": 2.4854, + "Memorability": 2.3528, + "Clarity": 2.4241, + "Naturalness": 2.4396 + }, + "4_16": { + "Coherence": 2.2808, + "Musicality": 2.1759, + "Memorability": 2.1629, + "Clarity": 2.0831, + "Naturalness": 2.2275 + }, + "4_61": { + "Coherence": 1.8089, + "Musicality": 1.8357, + "Memorability": 1.8118, + "Clarity": 1.783, + "Naturalness": 1.7906 + }, + "2_33": { + "Coherence": 2.2052, + "Musicality": 2.0665, + "Memorability": 2.0094, + 
"Clarity": 2.102, + "Naturalness": 2.0444 + }, + "2_10": { + "Coherence": 2.2482, + "Musicality": 2.1705, + "Memorability": 1.9102, + "Clarity": 2.0577, + "Naturalness": 2.0466 + }, + "2_0": { + "Coherence": 1.8268, + "Musicality": 1.7328, + "Memorability": 1.7469, + "Clarity": 1.6844, + "Naturalness": 1.8355 + }, + "2_83": { + "Coherence": 2.4584, + "Musicality": 2.3891, + "Memorability": 2.2064, + "Clarity": 2.244, + "Naturalness": 2.3245 + }, + "2_16": { + "Coherence": 2.1736, + "Musicality": 1.9615, + "Memorability": 1.9975, + "Clarity": 1.9478, + "Naturalness": 1.9718 + }, + "0_31": { + "Coherence": 2.7925, + "Musicality": 2.5672, + "Memorability": 2.793, + "Clarity": 2.6695, + "Naturalness": 2.7267 + }, + "4_32": { + "Coherence": 2.2045, + "Musicality": 2.0841, + "Memorability": 2.108, + "Clarity": 2.0678, + "Naturalness": 2.0925 + }, + "1_76": { + "Coherence": 1.927, + "Musicality": 1.9184, + "Memorability": 2.0669, + "Clarity": 1.9465, + "Naturalness": 1.9366 + }, + "1_43": { + "Coherence": 2.3344, + "Musicality": 2.4682, + "Memorability": 2.3157, + "Clarity": 2.3336, + "Naturalness": 2.2014 + }, + "1_54": { + "Coherence": 1.9669, + "Musicality": 1.9557, + "Memorability": 1.9318, + "Clarity": 1.8269, + "Naturalness": 1.8148 + }, + "4_43": { + "Coherence": 2.5014, + "Musicality": 2.5679, + "Memorability": 2.2708, + "Clarity": 2.4154, + "Naturalness": 2.4212 + }, + "4_77": { + "Coherence": 2.0359, + "Musicality": 2.0298, + "Memorability": 1.7116, + "Clarity": 1.8409, + "Naturalness": 1.8259 + }, + "0_58": { + "Coherence": 1.9356, + "Musicality": 1.947, + "Memorability": 1.9151, + "Clarity": 1.8816, + "Naturalness": 1.954 + }, + "2_36": { + "Coherence": 2.3709, + "Musicality": 2.3627, + "Memorability": 2.3401, + "Clarity": 2.2308, + "Naturalness": 2.2637 + }, + "1_65": { + "Coherence": 2.4607, + "Musicality": 2.4541, + "Memorability": 2.1764, + "Clarity": 2.235, + "Naturalness": 2.2667 + }, + "0_26": { + "Coherence": 2.2933, + "Musicality": 2.1993, + "Memorability": 2.1394, + "Clarity": 2.1676, + "Naturalness": 2.1097 + }, + "2_65": { + "Coherence": 2.0945, + "Musicality": 2.0679, + "Memorability": 1.8917, + "Clarity": 2.0363, + "Naturalness": 1.9987 + }, + "2_52": { + "Coherence": 2.2121, + "Musicality": 2.3237, + "Memorability": 2.2339, + "Clarity": 2.2382, + "Naturalness": 2.2416 + }, + "0_93": { + "Coherence": 2.1196, + "Musicality": 1.9865, + "Memorability": 1.9946, + "Clarity": 1.9056, + "Naturalness": 1.9408 + }, + "4_63": { + "Coherence": 2.5936, + "Musicality": 2.4613, + "Memorability": 2.4061, + "Clarity": 2.37, + "Naturalness": 2.403 + }, + "0_78": { + "Coherence": 2.1615, + "Musicality": 1.9972, + "Memorability": 1.9241, + "Clarity": 2.0307, + "Naturalness": 2.0021 + }, + "4_89": { + "Coherence": 2.0578, + "Musicality": 2.1219, + "Memorability": 1.988, + "Clarity": 1.9906, + "Naturalness": 1.9254 + }, + "4_76": { + "Coherence": 2.2024, + "Musicality": 2.2634, + "Memorability": 2.2442, + "Clarity": 2.2597, + "Naturalness": 2.2427 + }, + "1_62": { + "Coherence": 2.4221, + "Musicality": 2.2359, + "Memorability": 2.1999, + "Clarity": 2.2348, + "Naturalness": 2.1607 + }, + "3_30": { + "Coherence": 2.1491, + "Musicality": 2.1072, + "Memorability": 2.0558, + "Clarity": 2.0153, + "Naturalness": 1.9973 + }, + "4_0": { + "Coherence": 1.716, + "Musicality": 1.7595, + "Memorability": 1.5804, + "Clarity": 1.6338, + "Naturalness": 1.6699 + }, + "2_57": { + "Coherence": 1.6038, + "Musicality": 1.6037, + "Memorability": 1.5694, + "Clarity": 1.5594, + "Naturalness": 1.5852 + }, + "0_44": { 
+ "Coherence": 2.1459, + "Musicality": 2.0758, + "Memorability": 1.8967, + "Clarity": 1.932, + "Naturalness": 1.9503 + }, + "3_72": { + "Coherence": 2.4176, + "Musicality": 2.4026, + "Memorability": 2.3219, + "Clarity": 2.2086, + "Naturalness": 2.2489 + }, + "1_59": { + "Coherence": 2.2601, + "Musicality": 2.1113, + "Memorability": 1.9935, + "Clarity": 1.9936, + "Naturalness": 2.0203 + }, + "1_92": { + "Coherence": 1.9006, + "Musicality": 1.8884, + "Memorability": 1.9651, + "Clarity": 1.8136, + "Naturalness": 1.7773 + }, + "1_6": { + "Coherence": 1.5059, + "Musicality": 1.4867, + "Memorability": 1.5298, + "Clarity": 1.4985, + "Naturalness": 1.5145 + }, + "3_43": { + "Coherence": 2.8295, + "Musicality": 2.7372, + "Memorability": 2.6963, + "Clarity": 2.6524, + "Naturalness": 2.5917 + }, + "1_85": { + "Coherence": 2.1031, + "Musicality": 1.9526, + "Memorability": 2.0542, + "Clarity": 1.9236, + "Naturalness": 2.0189 + }, + "4_79": { + "Coherence": 1.7147, + "Musicality": 1.6407, + "Memorability": 1.6568, + "Clarity": 1.6287, + "Naturalness": 1.6046 + }, + "4_69": { + "Coherence": 2.4221, + "Musicality": 2.2589, + "Memorability": 2.4544, + "Clarity": 2.2849, + "Naturalness": 2.3705 + }, + "1_97": { + "Coherence": 1.9869, + "Musicality": 1.9427, + "Memorability": 1.7226, + "Clarity": 1.872, + "Naturalness": 1.8986 + }, + "1_11": { + "Coherence": 2.2454, + "Musicality": 2.2705, + "Memorability": 1.9677, + "Clarity": 2.1494, + "Naturalness": 2.1263 + }, + "0_12": { + "Coherence": 2.1644, + "Musicality": 2.1354, + "Memorability": 2.0599, + "Clarity": 2.0154, + "Naturalness": 2.0486 + }, + "0_43": { + "Coherence": 2.3703, + "Musicality": 2.3369, + "Memorability": 2.0079, + "Clarity": 2.1595, + "Naturalness": 2.2462 + }, + "1_10": { + "Coherence": 2.8041, + "Musicality": 2.828, + "Memorability": 2.6825, + "Clarity": 2.6604, + "Naturalness": 2.5805 + }, + "3_45": { + "Coherence": 1.494, + "Musicality": 1.4763, + "Memorability": 1.5155, + "Clarity": 1.4437, + "Naturalness": 1.4177 + }, + "0_50": { + "Coherence": 1.952, + "Musicality": 2.0015, + "Memorability": 1.9009, + "Clarity": 1.954, + "Naturalness": 2.0498 + }, + "1_45": { + "Coherence": 2.0283, + "Musicality": 2.0581, + "Memorability": 1.9296, + "Clarity": 1.8813, + "Naturalness": 1.9802 + }, + "1_78": { + "Coherence": 1.4743, + "Musicality": 1.479, + "Memorability": 1.4332, + "Clarity": 1.4293, + "Naturalness": 1.4072 + }, + "2_62": { + "Coherence": 2.1068, + "Musicality": 1.9854, + "Memorability": 2.1011, + "Clarity": 2.0366, + "Naturalness": 2.1211 + }, + "0_67": { + "Coherence": 2.5199, + "Musicality": 2.6554, + "Memorability": 2.3816, + "Clarity": 2.4124, + "Naturalness": 2.482 + }, + "4_23": { + "Coherence": 2.1429, + "Musicality": 2.1081, + "Memorability": 2.0469, + "Clarity": 1.9779, + "Naturalness": 2.1301 + }, + "1_21": { + "Coherence": 1.6967, + "Musicality": 1.546, + "Memorability": 1.6842, + "Clarity": 1.5955, + "Naturalness": 1.6175 + }, + "2_91": { + "Coherence": 1.7161, + "Musicality": 1.647, + "Memorability": 1.6533, + "Clarity": 1.6559, + "Naturalness": 1.5782 + }, + "0_87": { + "Coherence": 2.4759, + "Musicality": 2.3796, + "Memorability": 2.2335, + "Clarity": 2.2174, + "Naturalness": 2.3454 + }, + "2_73": { + "Coherence": 1.9303, + "Musicality": 1.9323, + "Memorability": 1.8936, + "Clarity": 1.7804, + "Naturalness": 1.8337 + }, + "4_84": { + "Coherence": 2.2965, + "Musicality": 2.2682, + "Memorability": 2.0356, + "Clarity": 2.1209, + "Naturalness": 2.1255 + }, + "0_81": { + "Coherence": 2.505, + "Musicality": 2.2662, + 
"Memorability": 2.3091, + "Clarity": 2.3909, + "Naturalness": 2.386 + }, + "0_41": { + "Coherence": 2.4277, + "Musicality": 2.459, + "Memorability": 2.4047, + "Clarity": 2.3466, + "Naturalness": 2.2622 + }, + "4_55": { + "Coherence": 2.0315, + "Musicality": 2.0147, + "Memorability": 1.9808, + "Clarity": 1.9455, + "Naturalness": 2.0539 + }, + "2_48": { + "Coherence": 1.5202, + "Musicality": 1.4762, + "Memorability": 1.5298, + "Clarity": 1.554, + "Naturalness": 1.6199 + }, + "1_51": { + "Coherence": 1.4029, + "Musicality": 1.4056, + "Memorability": 1.391, + "Clarity": 1.4451, + "Naturalness": 1.4142 + }, + "0_11": { + "Coherence": 2.2915, + "Musicality": 2.386, + "Memorability": 2.2595, + "Clarity": 2.2164, + "Naturalness": 2.328 + }, + "1_91": { + "Coherence": 2.2548, + "Musicality": 2.3364, + "Memorability": 2.2506, + "Clarity": 2.141, + "Naturalness": 2.2084 + }, + "2_80": { + "Coherence": 2.2379, + "Musicality": 2.3012, + "Memorability": 2.3898, + "Clarity": 2.1524, + "Naturalness": 2.1655 + }, + "0_19": { + "Coherence": 2.104, + "Musicality": 2.045, + "Memorability": 2.0121, + "Clarity": 1.947, + "Naturalness": 2.0452 + }, + "1_64": { + "Coherence": 2.2254, + "Musicality": 2.242, + "Memorability": 1.9814, + "Clarity": 2.1224, + "Naturalness": 2.099 + }, + "3_55": { + "Coherence": 2.62, + "Musicality": 2.5255, + "Memorability": 2.39, + "Clarity": 2.398, + "Naturalness": 2.4008 + }, + "3_37": { + "Coherence": 1.7342, + "Musicality": 1.7382, + "Memorability": 1.7473, + "Clarity": 1.6751, + "Naturalness": 1.7243 + }, + "1_9": { + "Coherence": 1.9233, + "Musicality": 1.8755, + "Memorability": 1.8503, + "Clarity": 1.8165, + "Naturalness": 1.9132 + }, + "0_39": { + "Coherence": 2.3557, + "Musicality": 2.1258, + "Memorability": 2.1873, + "Clarity": 2.1679, + "Naturalness": 2.228 + }, + "2_39": { + "Coherence": 2.0775, + "Musicality": 2.1469, + "Memorability": 1.9039, + "Clarity": 2.1632, + "Naturalness": 2.0126 + }, + "2_69": { + "Coherence": 2.8159, + "Musicality": 2.7529, + "Memorability": 2.5734, + "Clarity": 2.5901, + "Naturalness": 2.5003 + }, + "2_81": { + "Coherence": 1.7624, + "Musicality": 1.7138, + "Memorability": 1.6759, + "Clarity": 1.714, + "Naturalness": 1.6316 + }, + "3_58": { + "Coherence": 1.9916, + "Musicality": 1.8992, + "Memorability": 1.992, + "Clarity": 1.9266, + "Naturalness": 1.9128 + }, + "1_99": { + "Coherence": 1.945, + "Musicality": 2.1036, + "Memorability": 1.8971, + "Clarity": 1.8253, + "Naturalness": 1.8982 + }, + "0_99": { + "Coherence": 2.4951, + "Musicality": 2.3151, + "Memorability": 2.4236, + "Clarity": 2.4955, + "Naturalness": 2.4378 + }, + "3_17": { + "Coherence": 2.5045, + "Musicality": 2.3734, + "Memorability": 2.5493, + "Clarity": 2.4323, + "Naturalness": 2.4147 + }, + "3_50": { + "Coherence": 2.6205, + "Musicality": 2.6439, + "Memorability": 2.4681, + "Clarity": 2.6855, + "Naturalness": 2.5049 + }, + "4_91": { + "Coherence": 2.3077, + "Musicality": 2.3176, + "Memorability": 2.1234, + "Clarity": 2.1738, + "Naturalness": 2.3045 + }, + "3_56": { + "Coherence": 2.1082, + "Musicality": 2.011, + "Memorability": 2.0182, + "Clarity": 2.016, + "Naturalness": 2.1074 + }, + "3_82": { + "Coherence": 1.7938, + "Musicality": 1.9296, + "Memorability": 1.9261, + "Clarity": 1.7735, + "Naturalness": 1.9263 + }, + "4_17": { + "Coherence": 2.0995, + "Musicality": 2.0639, + "Memorability": 1.8537, + "Clarity": 1.9828, + "Naturalness": 1.9461 + }, + "4_72": { + "Coherence": 2.6618, + "Musicality": 2.4627, + "Memorability": 2.4604, + "Clarity": 2.3689, + "Naturalness": 2.3129 
+ }, + "4_10": { + "Coherence": 1.8977, + "Musicality": 1.8834, + "Memorability": 1.9509, + "Clarity": 1.8282, + "Naturalness": 1.899 + }, + "4_74": { + "Coherence": 1.8133, + "Musicality": 1.6628, + "Memorability": 1.6591, + "Clarity": 1.621, + "Naturalness": 1.857 + }, + "0_7": { + "Coherence": 2.3334, + "Musicality": 2.1764, + "Memorability": 2.4455, + "Clarity": 2.2542, + "Naturalness": 2.2119 + }, + "0_53": { + "Coherence": 1.9901, + "Musicality": 1.9633, + "Memorability": 1.9243, + "Clarity": 1.8607, + "Naturalness": 1.8191 + }, + "3_83": { + "Coherence": 2.1562, + "Musicality": 2.2071, + "Memorability": 1.9081, + "Clarity": 2.1533, + "Naturalness": 1.9599 + }, + "3_35": { + "Coherence": 2.0654, + "Musicality": 2.0104, + "Memorability": 1.9849, + "Clarity": 1.9511, + "Naturalness": 2.0346 + }, + "4_88": { + "Coherence": 2.2395, + "Musicality": 2.1779, + "Memorability": 2.1249, + "Clarity": 2.1083, + "Naturalness": 2.2168 + }, + "1_52": { + "Coherence": 1.9105, + "Musicality": 1.905, + "Memorability": 1.7679, + "Clarity": 1.8393, + "Naturalness": 1.8354 + }, + "1_95": { + "Coherence": 2.1638, + "Musicality": 2.106, + "Memorability": 2.2122, + "Clarity": 2.1135, + "Naturalness": 2.0643 + }, + "1_1": { + "Coherence": 1.8786, + "Musicality": 1.8373, + "Memorability": 1.8086, + "Clarity": 1.746, + "Naturalness": 1.9081 + }, + "3_51": { + "Coherence": 2.0104, + "Musicality": 2.0662, + "Memorability": 1.987, + "Clarity": 1.8889, + "Naturalness": 1.9966 + }, + "4_73": { + "Coherence": 2.2588, + "Musicality": 2.0573, + "Memorability": 2.3054, + "Clarity": 2.1202, + "Naturalness": 2.1212 + }, + "4_39": { + "Coherence": 2.1785, + "Musicality": 2.0144, + "Memorability": 1.9765, + "Clarity": 2.0063, + "Naturalness": 2.0919 + }, + "2_85": { + "Coherence": 2.2436, + "Musicality": 2.2585, + "Memorability": 2.1735, + "Clarity": 2.1191, + "Naturalness": 2.1095 + }, + "0_57": { + "Coherence": 2.2371, + "Musicality": 2.0397, + "Memorability": 2.1769, + "Clarity": 2.0651, + "Naturalness": 2.0733 + }, + "2_60": { + "Coherence": 1.8551, + "Musicality": 1.9177, + "Memorability": 2.0625, + "Clarity": 1.9154, + "Naturalness": 1.943 + }, + "4_90": { + "Coherence": 2.0694, + "Musicality": 2.1923, + "Memorability": 2.1006, + "Clarity": 1.8264, + "Naturalness": 1.9136 + }, + "1_53": { + "Coherence": 2.0543, + "Musicality": 2.0252, + "Memorability": 1.8597, + "Clarity": 1.9347, + "Naturalness": 1.908 + }, + "4_20": { + "Coherence": 2.5224, + "Musicality": 2.4856, + "Memorability": 2.3748, + "Clarity": 2.3196, + "Naturalness": 2.3066 + }, + "3_90": { + "Coherence": 2.6885, + "Musicality": 2.7295, + "Memorability": 2.3706, + "Clarity": 2.5435, + "Naturalness": 2.53 + }, + "0_82": { + "Coherence": 2.0804, + "Musicality": 1.9735, + "Memorability": 2.0409, + "Clarity": 2.0917, + "Naturalness": 2.0347 + }, + "2_95": { + "Coherence": 2.3756, + "Musicality": 2.4778, + "Memorability": 2.3212, + "Clarity": 2.2432, + "Naturalness": 2.3635 + }, + "0_40": { + "Coherence": 2.0068, + "Musicality": 1.9907, + "Memorability": 2.1615, + "Clarity": 2.0915, + "Naturalness": 2.0913 + }, + "1_87": { + "Coherence": 2.3678, + "Musicality": 2.2955, + "Memorability": 2.3364, + "Clarity": 2.5113, + "Naturalness": 2.2153 + }, + "3_33": { + "Coherence": 2.6268, + "Musicality": 2.71, + "Memorability": 2.6308, + "Clarity": 2.494, + "Naturalness": 2.5557 + }, + "4_30": { + "Coherence": 2.3005, + "Musicality": 2.1241, + "Memorability": 2.1483, + "Clarity": 2.1645, + "Naturalness": 2.1296 + }, + "3_3": { + "Coherence": 2.3968, + "Musicality": 
2.142, + "Memorability": 2.127, + "Clarity": 2.1015, + "Naturalness": 2.1594 + }, + "3_36": { + "Coherence": 1.8789, + "Musicality": 1.9, + "Memorability": 1.9269, + "Clarity": 1.9324, + "Naturalness": 1.9133 + }, + "4_80": { + "Coherence": 1.9391, + "Musicality": 1.7574, + "Memorability": 1.7275, + "Clarity": 1.7538, + "Naturalness": 1.7564 + }, + "2_51": { + "Coherence": 2.3966, + "Musicality": 2.1859, + "Memorability": 2.1455, + "Clarity": 2.0988, + "Naturalness": 2.1257 + }, + "4_58": { + "Coherence": 2.1579, + "Musicality": 2.1347, + "Memorability": 2.0173, + "Clarity": 2.0455, + "Naturalness": 2.1905 + }, + "1_34": { + "Coherence": 2.3832, + "Musicality": 2.1407, + "Memorability": 2.2762, + "Clarity": 2.0385, + "Naturalness": 2.055 + }, + "0_36": { + "Coherence": 1.6852, + "Musicality": 1.6678, + "Memorability": 1.4983, + "Clarity": 1.5262, + "Naturalness": 1.5535 + }, + "1_49": { + "Coherence": 2.0158, + "Musicality": 2.041, + "Memorability": 1.825, + "Clarity": 1.9561, + "Naturalness": 1.996 + }, + "3_29": { + "Coherence": 2.5665, + "Musicality": 2.4987, + "Memorability": 2.4091, + "Clarity": 2.3353, + "Naturalness": 2.318 + }, + "2_38": { + "Coherence": 2.1383, + "Musicality": 2.2452, + "Memorability": 2.0626, + "Clarity": 1.9986, + "Naturalness": 2.0202 + }, + "3_8": { + "Coherence": 1.8991, + "Musicality": 1.8809, + "Memorability": 1.7575, + "Clarity": 1.8112, + "Naturalness": 1.8553 + }, + "1_70": { + "Coherence": 2.684, + "Musicality": 2.4419, + "Memorability": 2.4613, + "Clarity": 2.4446, + "Naturalness": 2.5297 + }, + "0_24": { + "Coherence": 2.11, + "Musicality": 1.9138, + "Memorability": 1.8638, + "Clarity": 1.9175, + "Naturalness": 2.0126 + }, + "0_83": { + "Coherence": 2.1425, + "Musicality": 1.9797, + "Memorability": 2.0508, + "Clarity": 1.917, + "Naturalness": 1.9763 + }, + "0_90": { + "Coherence": 2.1196, + "Musicality": 2.2561, + "Memorability": 2.0771, + "Clarity": 2.202, + "Naturalness": 2.1228 + }, + "2_43": { + "Coherence": 2.287, + "Musicality": 2.2181, + "Memorability": 2.1297, + "Clarity": 2.1293, + "Naturalness": 2.1854 + }, + "4_11": { + "Coherence": 1.6293, + "Musicality": 1.6067, + "Memorability": 1.4963, + "Clarity": 1.5133, + "Naturalness": 1.5644 + }, + "2_5": { + "Coherence": 2.2671, + "Musicality": 2.1869, + "Memorability": 2.0988, + "Clarity": 2.0693, + "Naturalness": 2.0278 + }, + "2_22": { + "Coherence": 2.1326, + "Musicality": 1.9773, + "Memorability": 2.0836, + "Clarity": 1.9685, + "Naturalness": 1.9751 + }, + "1_15": { + "Coherence": 2.6055, + "Musicality": 2.61, + "Memorability": 2.2391, + "Clarity": 2.3558, + "Naturalness": 2.3996 + }, + "2_66": { + "Coherence": 2.0179, + "Musicality": 2.0401, + "Memorability": 1.9842, + "Clarity": 1.9368, + "Naturalness": 2.0828 + }, + "1_83": { + "Coherence": 2.0647, + "Musicality": 1.9912, + "Memorability": 2.007, + "Clarity": 2.0035, + "Naturalness": 1.8946 + }, + "4_31": { + "Coherence": 1.7548, + "Musicality": 1.7506, + "Memorability": 1.7613, + "Clarity": 1.7323, + "Naturalness": 1.813 + }, + "0_35": { + "Coherence": 2.406, + "Musicality": 2.4684, + "Memorability": 2.3046, + "Clarity": 2.2827, + "Naturalness": 2.2514 + }, + "4_46": { + "Coherence": 1.735, + "Musicality": 1.5574, + "Memorability": 1.6575, + "Clarity": 1.594, + "Naturalness": 1.6847 + }, + "4_82": { + "Coherence": 2.0916, + "Musicality": 1.9722, + "Memorability": 1.9405, + "Clarity": 1.9748, + "Naturalness": 1.9839 + }, + "4_60": { + "Coherence": 1.9311, + "Musicality": 1.8396, + "Memorability": 1.8892, + "Clarity": 1.8481, + 
"Naturalness": 1.8399 + }, + "3_57": { + "Coherence": 2.308, + "Musicality": 2.2293, + "Memorability": 2.1903, + "Clarity": 2.029, + "Naturalness": 2.1156 + }, + "1_46": { + "Coherence": 2.6872, + "Musicality": 2.7337, + "Memorability": 2.4985, + "Clarity": 2.6359, + "Naturalness": 2.7108 + }, + "0_52": { + "Coherence": 2.5576, + "Musicality": 2.5189, + "Memorability": 2.2952, + "Clarity": 2.3614, + "Naturalness": 2.3503 + }, + "2_70": { + "Coherence": 1.947, + "Musicality": 1.8895, + "Memorability": 1.8951, + "Clarity": 1.8664, + "Naturalness": 2.0238 + }, + "3_12": { + "Coherence": 1.6705, + "Musicality": 1.6687, + "Memorability": 1.6058, + "Clarity": 1.6089, + "Naturalness": 1.7315 + }, + "1_13": { + "Coherence": 1.9944, + "Musicality": 1.8802, + "Memorability": 1.8915, + "Clarity": 1.8086, + "Naturalness": 1.9355 + }, + "0_69": { + "Coherence": 2.1448, + "Musicality": 2.2876, + "Memorability": 2.3752, + "Clarity": 2.1484, + "Naturalness": 2.1355 + }, + "2_96": { + "Coherence": 2.1742, + "Musicality": 2.0379, + "Memorability": 2.1654, + "Clarity": 1.967, + "Naturalness": 2.1188 + }, + "1_29": { + "Coherence": 2.0899, + "Musicality": 2.0926, + "Memorability": 1.955, + "Clarity": 1.8637, + "Naturalness": 2.1318 + }, + "0_23": { + "Coherence": 2.3541, + "Musicality": 2.3899, + "Memorability": 2.3275, + "Clarity": 2.2506, + "Naturalness": 2.3021 + }, + "3_75": { + "Coherence": 1.7439, + "Musicality": 1.7416, + "Memorability": 1.745, + "Clarity": 1.7009, + "Naturalness": 1.8192 + }, + "0_22": { + "Coherence": 2.2646, + "Musicality": 2.322, + "Memorability": 2.1101, + "Clarity": 2.2849, + "Naturalness": 2.2111 + }, + "4_14": { + "Coherence": 2.007, + "Musicality": 1.6769, + "Memorability": 1.7567, + "Clarity": 1.773, + "Naturalness": 1.8146 + }, + "4_83": { + "Coherence": 1.8591, + "Musicality": 1.8238, + "Memorability": 1.797, + "Clarity": 1.7371, + "Naturalness": 1.8122 + }, + "4_87": { + "Coherence": 1.5046, + "Musicality": 1.468, + "Memorability": 1.5036, + "Clarity": 1.4899, + "Naturalness": 1.54 + }, + "1_22": { + "Coherence": 2.0131, + "Musicality": 1.9433, + "Memorability": 2.026, + "Clarity": 1.9211, + "Naturalness": 1.9316 + }, + "0_9": { + "Coherence": 1.8929, + "Musicality": 2.0008, + "Memorability": 1.9504, + "Clarity": 1.9416, + "Naturalness": 1.881 + }, + "4_8": { + "Coherence": 2.1362, + "Musicality": 1.8801, + "Memorability": 1.8276, + "Clarity": 1.824, + "Naturalness": 1.8764 + }, + "4_35": { + "Coherence": 2.5386, + "Musicality": 2.6107, + "Memorability": 2.4252, + "Clarity": 2.4234, + "Naturalness": 2.5142 + }, + "0_56": { + "Coherence": 2.1514, + "Musicality": 2.223, + "Memorability": 2.1618, + "Clarity": 2.0111, + "Naturalness": 2.0403 + }, + "2_46": { + "Coherence": 2.1267, + "Musicality": 2.2684, + "Memorability": 2.014, + "Clarity": 2.0737, + "Naturalness": 2.1822 + }, + "0_42": { + "Coherence": 2.2165, + "Musicality": 1.9245, + "Memorability": 2.0194, + "Clarity": 2.0125, + "Naturalness": 2.1712 + }, + "0_17": { + "Coherence": 1.9866, + "Musicality": 2.2045, + "Memorability": 1.8497, + "Clarity": 2.0364, + "Naturalness": 2.1013 + }, + "2_84": { + "Coherence": 2.0135, + "Musicality": 1.8381, + "Memorability": 1.9097, + "Clarity": 1.8395, + "Naturalness": 1.9822 + }, + "3_23": { + "Coherence": 1.627, + "Musicality": 1.8459, + "Memorability": 1.6816, + "Clarity": 1.6436, + "Naturalness": 1.7349 + }, + "2_27": { + "Coherence": 1.8048, + "Musicality": 1.8331, + "Memorability": 1.8633, + "Clarity": 1.8186, + "Naturalness": 1.9587 + }, + "4_13": { + "Coherence": 2.0373, + 
"Musicality": 2.0325, + "Memorability": 1.8568, + "Clarity": 1.956, + "Naturalness": 1.9767 + }, + "3_21": { + "Coherence": 2.0607, + "Musicality": 1.9899, + "Memorability": 2.0849, + "Clarity": 1.9912, + "Naturalness": 2.1108 + }, + "2_17": { + "Coherence": 2.2321, + "Musicality": 1.8576, + "Memorability": 1.8561, + "Clarity": 1.9274, + "Naturalness": 1.9511 + }, + "1_31": { + "Coherence": 2.4058, + "Musicality": 2.363, + "Memorability": 2.3252, + "Clarity": 2.2802, + "Naturalness": 2.3214 + }, + "2_99": { + "Coherence": 2.199, + "Musicality": 2.2032, + "Memorability": 2.181, + "Clarity": 2.2392, + "Naturalness": 2.1879 + }, + "1_19": { + "Coherence": 1.76, + "Musicality": 1.7322, + "Memorability": 1.7219, + "Clarity": 1.7162, + "Naturalness": 1.7226 + }, + "1_35": { + "Coherence": 2.8663, + "Musicality": 2.8794, + "Memorability": 2.8437, + "Clarity": 2.82, + "Naturalness": 2.7089 + }, + "4_15": { + "Coherence": 1.5884, + "Musicality": 1.6912, + "Memorability": 1.5345, + "Clarity": 1.5451, + "Naturalness": 1.5748 + }, + "4_98": { + "Coherence": 2.5377, + "Musicality": 2.4452, + "Memorability": 2.3973, + "Clarity": 2.4145, + "Naturalness": 2.3078 + }, + "4_97": { + "Coherence": 2.0159, + "Musicality": 1.9679, + "Memorability": 1.8909, + "Clarity": 1.8561, + "Naturalness": 1.9376 + }, + "3_62": { + "Coherence": 2.2726, + "Musicality": 2.1243, + "Memorability": 1.9979, + "Clarity": 2.0129, + "Naturalness": 2.169 + }, + "3_47": { + "Coherence": 1.9621, + "Musicality": 1.8414, + "Memorability": 1.7679, + "Clarity": 1.7602, + "Naturalness": 1.8394 + }, + "4_44": { + "Coherence": 1.9389, + "Musicality": 2.0183, + "Memorability": 1.9027, + "Clarity": 1.7985, + "Naturalness": 1.8496 + }, + "4_26": { + "Coherence": 2.3804, + "Musicality": 2.2318, + "Memorability": 2.2971, + "Clarity": 2.3846, + "Naturalness": 2.3424 + }, + "0_96": { + "Coherence": 2.1998, + "Musicality": 2.0293, + "Memorability": 2.1546, + "Clarity": 1.9797, + "Naturalness": 2.0439 + }, + "2_19": { + "Coherence": 2.6319, + "Musicality": 2.6034, + "Memorability": 2.674, + "Clarity": 2.4835, + "Naturalness": 2.5341 + }, + "3_34": { + "Coherence": 2.1468, + "Musicality": 2.0713, + "Memorability": 2.0913, + "Clarity": 2.0615, + "Naturalness": 2.1338 + }, + "3_53": { + "Coherence": 2.134, + "Musicality": 2.1984, + "Memorability": 2.235, + "Clarity": 2.1576, + "Naturalness": 2.1941 + }, + "1_4": { + "Coherence": 2.3963, + "Musicality": 2.1941, + "Memorability": 2.2128, + "Clarity": 2.2107, + "Naturalness": 2.1844 + }, + "4_96": { + "Coherence": 2.2618, + "Musicality": 2.232, + "Memorability": 2.2834, + "Clarity": 2.1766, + "Naturalness": 2.175 + }, + "4_67": { + "Coherence": 2.1472, + "Musicality": 2.2276, + "Memorability": 1.9249, + "Clarity": 2.1189, + "Naturalness": 2.0585 + }, + "3_18": { + "Coherence": 2.2926, + "Musicality": 2.0919, + "Memorability": 1.9636, + "Clarity": 2.1313, + "Naturalness": 2.0226 + }, + "2_34": { + "Coherence": 1.8336, + "Musicality": 1.7908, + "Memorability": 1.8167, + "Clarity": 1.8019, + "Naturalness": 1.7783 + }, + "1_33": { + "Coherence": 1.9092, + "Musicality": 1.8481, + "Memorability": 1.899, + "Clarity": 1.8472, + "Naturalness": 1.8423 + }, + "1_14": { + "Coherence": 2.0287, + "Musicality": 1.6931, + "Memorability": 1.8084, + "Clarity": 1.8911, + "Naturalness": 1.8841 + }, + "4_45": { + "Coherence": 2.2605, + "Musicality": 2.3191, + "Memorability": 1.9416, + "Clarity": 2.2267, + "Naturalness": 2.0908 + }, + "4_51": { + "Coherence": 2.3854, + "Musicality": 2.3819, + "Memorability": 2.3657, + "Clarity": 
2.2707, + "Naturalness": 2.356 + }, + "0_20": { + "Coherence": 2.4207, + "Musicality": 2.1803, + "Memorability": 2.3646, + "Clarity": 2.3664, + "Naturalness": 2.2901 + }, + "1_90": { + "Coherence": 2.1604, + "Musicality": 1.9937, + "Memorability": 1.9726, + "Clarity": 2.0761, + "Naturalness": 2.0193 + }, + "3_6": { + "Coherence": 2.0842, + "Musicality": 2.0169, + "Memorability": 1.9929, + "Clarity": 1.9499, + "Naturalness": 1.9548 + }, + "3_68": { + "Coherence": 2.1774, + "Musicality": 2.1836, + "Memorability": 2.2703, + "Clarity": 2.0859, + "Naturalness": 2.1572 + }, + "4_64": { + "Coherence": 2.2637, + "Musicality": 2.1571, + "Memorability": 2.1312, + "Clarity": 2.1039, + "Naturalness": 2.1308 + }, + "1_7": { + "Coherence": 2.127, + "Musicality": 2.1189, + "Memorability": 2.101, + "Clarity": 2.0108, + "Naturalness": 2.0466 + }, + "2_56": { + "Coherence": 2.255, + "Musicality": 2.2398, + "Memorability": 2.0856, + "Clarity": 2.2421, + "Naturalness": 2.0982 + }, + "2_64": { + "Coherence": 2.3961, + "Musicality": 2.3965, + "Memorability": 2.3291, + "Clarity": 2.2876, + "Naturalness": 2.3993 + }, + "4_2": { + "Coherence": 2.0556, + "Musicality": 2.0101, + "Memorability": 1.9861, + "Clarity": 1.9554, + "Naturalness": 1.8409 + }, + "0_48": { + "Coherence": 1.6236, + "Musicality": 1.5744, + "Memorability": 1.613, + "Clarity": 1.654, + "Naturalness": 1.5972 + }, + "4_33": { + "Coherence": 2.3151, + "Musicality": 2.311, + "Memorability": 2.0714, + "Clarity": 2.1852, + "Naturalness": 2.2574 + }, + "3_59": { + "Coherence": 2.2179, + "Musicality": 2.1151, + "Memorability": 2.1836, + "Clarity": 2.1124, + "Naturalness": 2.1609 + }, + "1_30": { + "Coherence": 2.3185, + "Musicality": 2.2906, + "Memorability": 2.2103, + "Clarity": 2.1885, + "Naturalness": 2.3091 + }, + "2_87": { + "Coherence": 2.2864, + "Musicality": 2.0823, + "Memorability": 2.132, + "Clarity": 2.1362, + "Naturalness": 2.1958 + }, + "1_27": { + "Coherence": 2.2529, + "Musicality": 2.2283, + "Memorability": 2.1236, + "Clarity": 2.2069, + "Naturalness": 2.1388 + }, + "1_50": { + "Coherence": 2.3244, + "Musicality": 2.3636, + "Memorability": 2.2035, + "Clarity": 2.2259, + "Naturalness": 2.2104 + }, + "4_54": { + "Coherence": 1.5802, + "Musicality": 1.6693, + "Memorability": 1.7025, + "Clarity": 1.6213, + "Naturalness": 1.7422 + }, + "3_38": { + "Coherence": 2.149, + "Musicality": 1.9559, + "Memorability": 1.8837, + "Clarity": 2.0031, + "Naturalness": 2.1194 + }, + "3_60": { + "Coherence": 2.1493, + "Musicality": 2.1146, + "Memorability": 1.8566, + "Clarity": 2.0343, + "Naturalness": 2.0217 + }, + "0_80": { + "Coherence": 2.2452, + "Musicality": 2.0877, + "Memorability": 1.9421, + "Clarity": 2.0476, + "Naturalness": 2.0674 + }, + "0_10": { + "Coherence": 2.3701, + "Musicality": 2.1845, + "Memorability": 2.1567, + "Clarity": 2.0152, + "Naturalness": 2.2248 + }, + "4_37": { + "Coherence": 1.8832, + "Musicality": 1.7516, + "Memorability": 1.8802, + "Clarity": 1.9111, + "Naturalness": 1.9214 + }, + "2_47": { + "Coherence": 2.2542, + "Musicality": 2.4492, + "Memorability": 2.208, + "Clarity": 2.2574, + "Naturalness": 2.2519 + }, + "0_84": { + "Coherence": 2.0714, + "Musicality": 1.9126, + "Memorability": 2.0429, + "Clarity": 1.961, + "Naturalness": 2.0466 + }, + "2_98": { + "Coherence": 1.8055, + "Musicality": 1.8217, + "Memorability": 1.7747, + "Clarity": 1.6343, + "Naturalness": 1.6715 + }, + "1_26": { + "Coherence": 1.9664, + "Musicality": 1.95, + "Memorability": 1.9236, + "Clarity": 1.8303, + "Naturalness": 1.8775 + }, + "4_50": { + 
"Coherence": 1.8427, + "Musicality": 1.73, + "Memorability": 1.6585, + "Clarity": 1.7139, + "Naturalness": 1.6576 + }, + "0_0": { + "Coherence": 2.0397, + "Musicality": 1.9672, + "Memorability": 1.946, + "Clarity": 1.8068, + "Naturalness": 1.9073 + }, + "4_47": { + "Coherence": 1.8557, + "Musicality": 1.8835, + "Memorability": 1.837, + "Clarity": 1.7625, + "Naturalness": 1.9051 + }, + "2_13": { + "Coherence": 1.6357, + "Musicality": 1.5882, + "Memorability": 1.6177, + "Clarity": 1.4985, + "Naturalness": 1.5918 + }, + "4_36": { + "Coherence": 1.9727, + "Musicality": 1.8362, + "Memorability": 1.9173, + "Clarity": 1.8182, + "Naturalness": 1.9177 + }, + "4_27": { + "Coherence": 1.5064, + "Musicality": 1.5201, + "Memorability": 1.5053, + "Clarity": 1.5035, + "Naturalness": 1.4923 + }, + "4_9": { + "Coherence": 2.1439, + "Musicality": 2.1297, + "Memorability": 2.0104, + "Clarity": 2.0009, + "Naturalness": 2.04 + }, + "1_89": { + "Coherence": 2.0242, + "Musicality": 2.0556, + "Memorability": 1.9212, + "Clarity": 1.8167, + "Naturalness": 2.0016 + }, + "1_73": { + "Coherence": 2.1793, + "Musicality": 2.2498, + "Memorability": 2.1285, + "Clarity": 2.1383, + "Naturalness": 2.1396 + }, + "2_86": { + "Coherence": 2.3578, + "Musicality": 2.2023, + "Memorability": 2.1193, + "Clarity": 2.2041, + "Naturalness": 2.1863 + }, + "3_39": { + "Coherence": 2.4611, + "Musicality": 2.3943, + "Memorability": 2.3378, + "Clarity": 2.3163, + "Naturalness": 2.3382 + }, + "0_29": { + "Coherence": 2.346, + "Musicality": 2.1596, + "Memorability": 2.1985, + "Clarity": 2.295, + "Naturalness": 2.1203 + }, + "1_18": { + "Coherence": 1.8233, + "Musicality": 1.7384, + "Memorability": 1.6232, + "Clarity": 1.6176, + "Naturalness": 1.7529 + }, + "3_7": { + "Coherence": 2.4063, + "Musicality": 2.4915, + "Memorability": 2.2263, + "Clarity": 2.4099, + "Naturalness": 2.3619 + }, + "1_17": { + "Coherence": 2.4014, + "Musicality": 2.2059, + "Memorability": 2.187, + "Clarity": 2.1376, + "Naturalness": 2.1224 + }, + "4_41": { + "Coherence": 1.914, + "Musicality": 1.9452, + "Memorability": 1.8625, + "Clarity": 1.9696, + "Naturalness": 1.9795 + }, + "1_61": { + "Coherence": 2.006, + "Musicality": 1.9873, + "Memorability": 1.9519, + "Clarity": 1.8659, + "Naturalness": 1.9498 + }, + "3_13": { + "Coherence": 2.3618, + "Musicality": 2.3475, + "Memorability": 2.2027, + "Clarity": 2.2215, + "Naturalness": 2.3608 + }, + "3_9": { + "Coherence": 2.385, + "Musicality": 2.3741, + "Memorability": 2.3, + "Clarity": 2.3678, + "Naturalness": 2.4543 + }, + "2_30": { + "Coherence": 2.0684, + "Musicality": 1.8582, + "Memorability": 2.06, + "Clarity": 1.9585, + "Naturalness": 2.0117 + }, + "2_53": { + "Coherence": 1.9078, + "Musicality": 1.8967, + "Memorability": 1.7585, + "Clarity": 2.0203, + "Naturalness": 1.8391 + }, + "2_41": { + "Coherence": 1.8686, + "Musicality": 1.9613, + "Memorability": 1.804, + "Clarity": 1.8032, + "Naturalness": 1.861 + }, + "1_81": { + "Coherence": 2.2679, + "Musicality": 2.1949, + "Memorability": 2.1999, + "Clarity": 2.0734, + "Naturalness": 2.2605 + }, + "3_98": { + "Coherence": 2.3907, + "Musicality": 2.4633, + "Memorability": 2.1563, + "Clarity": 2.161, + "Naturalness": 2.4192 + }, + "2_89": { + "Coherence": 2.0126, + "Musicality": 2.0098, + "Memorability": 1.9861, + "Clarity": 1.9218, + "Naturalness": 1.9113 + }, + "4_28": { + "Coherence": 2.0495, + "Musicality": 1.8543, + "Memorability": 1.8595, + "Clarity": 1.8491, + "Naturalness": 1.9354 + }, + "2_12": { + "Coherence": 2.2923, + "Musicality": 2.3921, + "Memorability": 
2.2181, + "Clarity": 2.1807, + "Naturalness": 2.1864 + }, + "1_3": { + "Coherence": 1.6278, + "Musicality": 1.7068, + "Memorability": 1.6259, + "Clarity": 1.7255, + "Naturalness": 1.7471 + }, + "3_78": { + "Coherence": 2.9628, + "Musicality": 2.8431, + "Memorability": 2.654, + "Clarity": 2.6517, + "Naturalness": 2.7332 + }, + "3_87": { + "Coherence": 1.8677, + "Musicality": 1.847, + "Memorability": 1.966, + "Clarity": 1.7626, + "Naturalness": 1.8631 + }, + "0_55": { + "Coherence": 1.9595, + "Musicality": 1.8839, + "Memorability": 1.8605, + "Clarity": 1.8872, + "Naturalness": 1.8648 + }, + "1_79": { + "Coherence": 2.3852, + "Musicality": 2.3242, + "Memorability": 2.2843, + "Clarity": 2.1605, + "Naturalness": 2.2595 + }, + "3_19": { + "Coherence": 2.2715, + "Musicality": 2.2328, + "Memorability": 2.2698, + "Clarity": 2.2219, + "Naturalness": 2.3195 + }, + "2_8": { + "Coherence": 2.4697, + "Musicality": 2.2947, + "Memorability": 2.56, + "Clarity": 2.3589, + "Naturalness": 2.2362 + }, + "1_44": { + "Coherence": 2.3735, + "Musicality": 2.4384, + "Memorability": 2.2973, + "Clarity": 2.2148, + "Naturalness": 2.2393 + }, + "2_1": { + "Coherence": 2.4573, + "Musicality": 2.4089, + "Memorability": 2.4131, + "Clarity": 2.2625, + "Naturalness": 2.4246 + }, + "0_6": { + "Coherence": 2.3034, + "Musicality": 2.073, + "Memorability": 2.0064, + "Clarity": 1.9674, + "Naturalness": 2.126 + }, + "4_24": { + "Coherence": 2.065, + "Musicality": 2.1859, + "Memorability": 2.0134, + "Clarity": 1.9378, + "Naturalness": 2.0556 + }, + "2_61": { + "Coherence": 2.232, + "Musicality": 1.9937, + "Memorability": 2.0872, + "Clarity": 1.9703, + "Naturalness": 1.9535 + }, + "1_25": { + "Coherence": 2.2357, + "Musicality": 2.2056, + "Memorability": 2.2286, + "Clarity": 1.9696, + "Naturalness": 2.0042 + }, + "1_60": { + "Coherence": 1.9664, + "Musicality": 2.0033, + "Memorability": 1.9511, + "Clarity": 1.875, + "Naturalness": 1.9356 + }, + "1_36": { + "Coherence": 2.2049, + "Musicality": 2.385, + "Memorability": 2.263, + "Clarity": 2.164, + "Naturalness": 2.0728 + }, + "1_63": { + "Coherence": 1.8495, + "Musicality": 1.8794, + "Memorability": 1.9, + "Clarity": 1.8604, + "Naturalness": 1.8717 + }, + "2_21": { + "Coherence": 2.3975, + "Musicality": 2.256, + "Memorability": 2.204, + "Clarity": 2.1861, + "Naturalness": 2.32 + }, + "4_85": { + "Coherence": 2.5726, + "Musicality": 2.2728, + "Memorability": 2.3922, + "Clarity": 2.3106, + "Naturalness": 2.3212 + }, + "2_59": { + "Coherence": 2.2687, + "Musicality": 2.3734, + "Memorability": 2.1968, + "Clarity": 2.0332, + "Naturalness": 2.2547 + }, + "1_94": { + "Coherence": 2.1387, + "Musicality": 2.1505, + "Memorability": 2.3135, + "Clarity": 2.0918, + "Naturalness": 2.2138 + }, + "0_45": { + "Coherence": 2.349, + "Musicality": 2.1916, + "Memorability": 2.22, + "Clarity": 2.1934, + "Naturalness": 2.143 + }, + "3_85": { + "Coherence": 2.3207, + "Musicality": 2.3741, + "Memorability": 2.2619, + "Clarity": 2.2145, + "Naturalness": 2.1553 + }, + "1_40": { + "Coherence": 2.2562, + "Musicality": 2.3165, + "Memorability": 2.4133, + "Clarity": 2.2157, + "Naturalness": 2.2307 + }, + "1_47": { + "Coherence": 1.7885, + "Musicality": 1.7783, + "Memorability": 1.8075, + "Clarity": 1.6714, + "Naturalness": 1.8082 + }, + "average": { + "Coherence": 2.1767, + "Musicality": 2.1267, + "Memorability": 2.0738, + "Clarity": 2.0536, + "Naturalness": 2.0858 + } +} \ No newline at end of file diff --git a/assets/amadeus-framwork.drawio.png b/assets/amadeus-framwork.drawio.png new file mode 100644 index 
0000000..df6213c Binary files /dev/null and b/assets/amadeus-framwork.drawio.png differ diff --git a/assets/exp_amadeus.mp3 b/assets/exp_amadeus.mp3 new file mode 100644 index 0000000..a0fd3d9 Binary files /dev/null and b/assets/exp_amadeus.mp3 differ diff --git a/assets/inference.drawio.png b/assets/inference.drawio.png new file mode 100644 index 0000000..9ee7d16 Binary files /dev/null and b/assets/inference.drawio.png differ diff --git a/assets/merged_compare_clean.png b/assets/merged_compare_clean.png new file mode 100644 index 0000000..5501196 Binary files /dev/null and b/assets/merged_compare_clean.png differ diff --git a/data_representation/README.md b/data_representation/README.md new file mode 100644 index 0000000..ccfae12 --- /dev/null +++ b/data_representation/README.md @@ -0,0 +1,81 @@ +# Dataset Download + +Our model supports four different datasets: + +- **Symbolic Orchestral Database (SOD)**: [Link](https://qsdfo.github.io/LOP/database.html) +- **Lakh MIDI Dataset (Clean version)**: [Link](https://colinraffel.com/projects/lmd/) +- **Pop1k7**: [Link](https://github.com/YatingMusic/compound-word-transformer) +- **Pop909**: [Link](https://github.com/music-x-lab/POP909-Dataset) + +### Download Instructions + +You can download the datasets via the command line: + +```sh +# SOD +wget https://qsdfo.github.io/LOP/database/SOD.zip + +# LakhClean +wget http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz +``` + +For Pop1k7, the official repository link is currently unavailable. However, you can download it from this Google Drive link: +[Download Pop1k7](https://drive.google.com/file/d/1GnbELjE-kQ4WOkBmZ3XapFKIaltySRyV/view?usp=drive_link) + +For Pop909, the dataset is available in the official GitHub repository: [Repository link](https://github.com/music-x-lab/POP909-Dataset) + +### Using Your Own Dataset +If you plan to use your own dataset, you can modify the dataset class in the data_utils.py script under the symbolic_encoding folder inside the nested_music_transformer folder. Alternatively, for a simpler approach, rename your dataset to match one of the following options: + +- SOD: Use this for score-based MIDI datasets that require finer-grained quantization (supports up to 16th note triplet level quantization; 24 samples per quarter note). +- LakhClean: Suitable for score-based MIDI datasets requiring coarse-grained quantization (supports up to 16th note level quantization; 4 samples per quarter note). +- Pop1k7, Pop909: Ideal for expressive MIDI datasets requiring coarse-grained quantization (supports up to 16th note level quantization; 4 samples per quarter note). + +# Data Representation + 
+ +
+ + +This document outlines our standard data processing pipeline. By following the instructions and running the corresponding Python scripts, you can generate a data representation suited to your specific needs. + +We focus on symbolic music and limit the use of musical features to a select few. Each feature set size corresponds to specific musical attributes. Through various experiments, we decided to use **7 features** for the *Pop1k7* and *Pop909* datasets, which consist of pop piano music requiring velocity for expression, and **5 features** for the *Symbolic Orchestral Database (SOD)*, *Lakh MIDI*, and *SymphonyMIDI* datasets. + +- **4 features**: `["type", "beat", "pitch", "duration"]` +- **5 features**: `["type", "beat", "instrument", "pitch", "duration"]` +- **7 features**: `["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"]` +- **8 features**: `["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"]` + +## Parse Argument +- `-d`, `--dataset`: This required argument specifies the dataset to be used. It takes one of the following values: `"BachChorale"`, `"Pop1k7"`, `"Pop909"`, `"SOD"`, `"LakhClean"`, or `"SymphonyMIDI"`. + +- `-e`, `--encoding`: This required argument specifies the encoding scheme to use. It accepts one of the following: `"remi"`, `"cp"`, `"nb"`, or `"remi_pos"`. + +- `-f`, `--num_features`: This required argument specifies the number of features. It can take one of the following values: `4`, `5`, `7`, or `8`. + +- `-i`, `--in_dir`: This optional argument specifies the input data directory. It defaults to `../dataset/represented_data/corpus/` if not provided. + +- `-o`, `--out_dir`: This optional argument specifies the output data directory. It defaults to `../dataset/represented_data/events/`. + +- `--debug`: This flag enables debug mode when included. No additional value is needed. + +## 1. MIDI to Corpus +In this step, we convert MIDI files into a set of events containing various musical information. The MIDI files should be aligned with the beat and contain accurate time signature information. Place the MIDI files in `` and refer to the example files provided. Navigate to the `` folder and run the script. The converted data will be stored in ``. + +- Example usage: `python3 step1_midi2corpus.py --dataset SOD --num_features 5` + +## 2. Corpus to Event +We provide three types of representations: **REMI**, **Compound Word (CP)**, and **Note-based Encoding (NB)**. The converted data will be stored in ``. + +- Example usage: `python3 step2_corpus2event.py --dataset SOD --num_features 5 --encoding nb` + +## 3. Creating Vocabulary +This script creates a vocabulary in the `` folder. The vocabulary includes event-to-index pair information. + +- Example usage: `python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb` + +## 4. Event to Index +In this step, we convert events into indices for efficient model training. The converted data will be stored in ``. 
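The conversion itself is essentially a vocabulary lookup over the event stream produced in step 2, using the event-to-index pairs built in step 3. Below is a minimal, illustrative sketch of that idea; the `vocab` dictionary, event names, and values here are made up for demonstration and are not the project's actual classes or files.

```python
# Toy illustration of event-to-index conversion (names and values are hypothetical).
events = [
    {"name": "type", "value": "note"},
    {"name": "beat", "value": 0},
    {"name": "pitch", "value": 60},
    {"name": "duration", "value": 4},
]

# Step 3 produces event-to-index pairs; here we fake a tiny vocabulary.
vocab = {"type_note": 0, "beat_0": 1, "pitch_60": 2, "duration_4": 3}

# Converting a tune is then a straightforward lookup over its event sequence.
indices = [vocab[f"{e['name']}_{e['value']}"] for e in events]
print(indices)  # [0, 1, 2, 3]
```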
+ +- Example usage: `python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb` diff --git a/data_representation/__init__.py b/data_representation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_representation/__pycache__/__init__.cpython-310.pyc b/data_representation/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..417fb88 Binary files /dev/null and b/data_representation/__pycache__/__init__.cpython-310.pyc differ diff --git a/data_representation/__pycache__/constants.cpython-310.pyc b/data_representation/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000..a939e7a Binary files /dev/null and b/data_representation/__pycache__/constants.cpython-310.pyc differ diff --git a/data_representation/__pycache__/vocab_utils.cpython-310.pyc b/data_representation/__pycache__/vocab_utils.cpython-310.pyc new file mode 100644 index 0000000..fdcdf55 Binary files /dev/null and b/data_representation/__pycache__/vocab_utils.cpython-310.pyc differ diff --git a/data_representation/constants.py b/data_representation/constants.py new file mode 100644 index 0000000..dea8b21 --- /dev/null +++ b/data_representation/constants.py @@ -0,0 +1,422 @@ +import numpy as np + +# for chord analysis +NUM2PITCH = { + 0: 'C', + 1: 'C#', + 2: 'D', + 3: 'D#', + 4: 'E', + 5: 'F', + 6: 'F#', + 7: 'G', + 8: 'G#', + 9: 'A', + 10: 'A#', + 11: 'B', +} + +# referred to mmt "https://github.com/salu133445/mmt" +PROGRAM_INSTRUMENT_MAP = { + # Pianos + 0: "piano", + 1: "piano", + 2: "piano", + 3: "piano", + 4: "electric-piano", + 5: "electric-piano", + 6: "harpsichord", + 7: "clavinet", + # Chromatic Percussion + 8: "celesta", + 9: "glockenspiel", + 10: "music-box", + 11: "vibraphone", + 12: "marimba", + 13: "xylophone", + 14: "tubular-bells", + 15: "dulcimer", + # Organs + 16: "organ", + 17: "organ", + 18: "organ", + 19: "church-organ", + 20: "organ", + 21: "accordion", + 22: "harmonica", + 23: "bandoneon", + # Guitars + 24: "nylon-string-guitar", + 25: "steel-string-guitar", + 26: "electric-guitar", + 27: "electric-guitar", + 28: "electric-guitar", + 29: "electric-guitar", + 30: "electric-guitar", + 31: "electric-guitar", + # Basses + 32: "bass", + 33: "electric-bass", + 34: "electric-bass", + 35: "electric-bass", + 36: "slap-bass", + 37: "slap-bass", + 38: "synth-bass", + 39: "synth-bass", + # Strings + 40: "violin", + 41: "viola", + 42: "cello", + 43: "contrabass", + 44: "strings", + 45: "strings", + 46: "harp", + 47: "timpani", + # Ensemble + 48: "strings", + 49: "strings", + 50: "synth-strings", + 51: "synth-strings", + 52: "voices", + 53: "voices", + 54: "voices", + 55: "orchestra-hit", + # Brass + 56: "trumpet", + 57: "trombone", + 58: "tuba", + 59: "trumpet", + 60: "horn", + 61: "brasses", + 62: "synth-brasses", + 63: "synth-brasses", + # Reed + 64: "soprano-saxophone", + 65: "alto-saxophone", + 66: "tenor-saxophone", + 67: "baritone-saxophone", + 68: "oboe", + 69: "english-horn", + 70: "bassoon", + 71: "clarinet", + # Pipe + 72: "piccolo", + 73: "flute", + 74: "recorder", + 75: "pan-flute", + 76: None, + 77: None, + 78: None, + 79: "ocarina", + # Synth Lead + 80: "lead", + 81: "lead", + 82: "lead", + 83: "lead", + 84: "lead", + 85: "lead", + 86: "lead", + 87: "lead", + # Synth Pad + 88: "pad", + 89: "pad", + 90: "pad", + 91: "pad", + 92: "pad", + 93: "pad", + 94: "pad", + 95: "pad", + # Synth Effects + 96: None, + 97: None, + 98: None, + 99: None, + 100: None, + 101: None, + 102: None, + 103: None, + # Ethnic + 104: "sitar", + 105: "banjo", + 
106: "shamisen", + 107: "koto", + 108: "kalimba", + 109: "bag-pipe", + 110: "violin", + 111: "shehnai", + # Percussive + 112: None, + 113: None, + 114: "steel-drums", + 115: None, + 116: None, + 117: "melodic-tom", + 118: "synth-drums", + 119: "synth-drums", + # Sound effects + 120: None, + 121: None, + 122: None, + 123: None, + 124: None, + 125: None, + 126: None, + 127: None, +} + +# referred to mmt "https://github.com/salu133445/mmt" +INSTRUMENT_PROGRAM_MAP = { + # Pianos + "piano": 0, + "electric-piano": 4, + "harpsichord": 6, + "clavinet": 7, + # Chromatic Percussion + "celesta": 8, + "glockenspiel": 9, + "music-box": 10, + "vibraphone": 11, + "marimba": 12, + "xylophone": 13, + "tubular-bells": 14, + "dulcimer": 15, + # Organs + "organ": 16, + "church-organ": 19, + "accordion": 21, + "harmonica": 22, + "bandoneon": 23, + # Guitars + "nylon-string-guitar": 24, + "steel-string-guitar": 25, + "electric-guitar": 26, + # Basses + "bass": 32, + "electric-bass": 33, + "slap-bass": 36, + "synth-bass": 38, + # Strings + "violin": 40, + "viola": 41, + "cello": 42, + "contrabass": 43, + "harp": 46, + "timpani": 47, + # Ensemble + "strings": 49, + "synth-strings": 50, + "voices": 52, + "orchestra-hit": 55, + # Brass + "trumpet": 56, + "trombone": 57, + "tuba": 58, + "horn": 60, + "brasses": 61, + "synth-brasses": 62, + # Reed + "soprano-saxophone": 64, + "alto-saxophone": 65, + "tenor-saxophone": 66, + "baritone-saxophone": 67, + "oboe": 68, + "english-horn": 69, + "bassoon": 70, + "clarinet": 71, + # Pipe + "piccolo": 72, + "flute": 73, + "recorder": 74, + "pan-flute": 75, + "ocarina": 79, + # Synth Lead + "lead": 80, + # Synth Pad + "pad": 88, + # Ethnic + "sitar": 104, + "banjo": 105, + "shamisen": 106, + "koto": 107, + "kalimba": 108, + "bag-pipe": 109, + "shehnai": 111, + # Percussive + "steel-drums": 114, + "melodic-tom": 117, + "synth-drums": 118, +} + +FINED_PROGRAM_INSTRUMENT_MAP ={ + # Pianos + 0: "Acoustic-Grand-Piano", + 1: "Bright-Acoustic-Piano", + 2: "Electric-Grand-Piano", + 3: "Honky-Tonk-Piano", + 4: "Electric-Piano-1", + 5: "Electric-Piano-2", + 6: "Harpsichord", + 7: "Clavinet", + + # Chromatic Percussion + 8: "Celesta", + 9: "Glockenspiel", + 10: "Music-Box", + 11: "Vibraphone", + 12: "Marimba", + 13: "Xylophone", + 14: "Tubular-Bells", + 15: "Dulcimer", + + # Organs + 16: "Drawbar-Organ", + 17: "Percussive-Organ", + 18: "Rock-Organ", + 19: "Church-Organ", + 20: "Reed-Organ", + 21: "Accordion", + 22: "Harmonica", + 23: "Tango-Accordion", + + # Guitars + 24: "Acoustic-Guitar-nylon", + 25: "Acoustic-Guitar-steel", + 26: "Electric-Guitar-jazz", + 27: "Electric-Guitar-clean", + 28: "Electric-Guitar-muted", + 29: "Overdriven-Guitar", + 30: "Distortion-Guitar", + 31: "Guitar-harmonics", + + # Basses + 32: "Acoustic-Bass", + 33: "Electric-Bass-finger", + 34: "Electric-Bass-pick", + 35: "Fretless-Bass", + 36: "Slap-Bass-1", + 37: "Slap-Bass-2", + 38: "Synth-Bass-1", + 39: "Synth-Bass-2", + + # Strings & Orchestral + 40: "Violin", + 41: "Viola", + 42: "Cello", + 43: "Contrabass", + 44: "Tremolo-Strings", + 45: "Pizzicato-Strings", + 46: "Orchestral-Harp", + 47: "Timpani", + + # Ensemble + 48: "String-Ensemble-1", + 49: "String-Ensemble-2", + 50: "Synth-Strings-1", + 51: "Synth-Strings-2", + 52: "Choir-Aahs", + 53: "Voice-Oohs", + 54: "Synth-Voice", + 55: "Orchestra-Hit", + + # Brass + 56: "Trumpet", + 57: "Trombone", + 58: "Tuba", + 59: "Muted-Trumpet", + 60: "French-Horn", + 61: "Brass-Section", + 62: "Synth-Brass-1", + 63: "Synth-Brass-2", + + # Reeds + 64: "Soprano-Sax", + 65: 
"Alto-Sax", + 66: "Tenor-Sax", + 67: "Baritone-Sax", + 68: "Oboe", + 69: "English-Horn", + 70: "Bassoon", + 71: "Clarinet", + + # Pipes + 72: "Piccolo", + 73: "Flute", + 74: "Recorder", + 75: "Pan-Flute", + 76: "Blown-Bottle", + 77: "Shakuhachi", + 78: "Whistle", + 79: "Ocarina", + + # Synth Lead + 80: "Lead-1-square", + 81: "Lead-2-sawtooth", + 82: "Lead-3-calliope", + 83: "Lead-4-chiff", + 84: "Lead-5-charang", + 85: "Lead-6-voice", + 86: "Lead-7-fifths", + 87: "Lead-8-bass+lead", + + # Synth Pad + 88: "Pad-1-new-age", + 89: "Pad-2-warm", + 90: "Pad-3-polysynth", + 91: "Pad-4-choir", + 92: "Pad-5-bowed", + 93: "Pad-6-metallic", + 94: "Pad-7-halo", + 95: "Pad-8-sweep", + + # Effects + 96: "FX-1-rain", + 97: "FX-2-soundtrack", + 98: "FX-3-crystal", + 99: "FX-4-atmosphere", + 100: "FX-5-brightness", + 101: "FX-6-goblins", + 102: "FX-7-echoes", + 103: "FX-8-sci-fi", + + # Ethnic & Percussion + 104: "Sitar", + 105: "Banjo", + 106: "Shamisen", + 107: "Koto", + 108: "Kalimba", + 109: "Bag-pipe", + 110: "Fiddle", + 111: "Shanai", + + # Percussive + 112: "Tinkle-Bell", + 113: "Agogo", + 114: "Steel-Drums", + 115: "Woodblock", + 116: "Taiko-Drum", + 117: "Melodic-Tom", + 118: "Synth-Drum", + 119: "Reverse-Cymbal", + + # Sound Effects + 120: "Guitar-Fret-Noise", + 121: "Breath-Noise", + 122: "Seashore", + 123: "Bird-Tweet", + 124: "Telephone-Ring", + 125: "Helicopter", + 126: "Applause", + 127: "Gunshot" +} + + +REGULAR_NUM_DENOM = [(1, 1), (1, 2), (2, 2), (3, 2), (4, 2), + (1, 4), (2, 4), (3, 4), (4, 4), (5, 4), (6, 4), (7, 4), (8, 4), + (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8), (9, 8), (11, 8), (12, 8)] +CORE_NUM_DENOM = [(1, 1), (1, 2), (2, 2), (4, 2), + (1, 4), (2, 4), (3, 4), (4, 4), (5, 4), + (1, 8), (2, 8), (3, 8), (6, 8), (9, 8), (12, 8)] +VALID_TIME_SIGNATURES = ['time_signature_' + str(x[0]) + '/' + str(x[1]) for x in REGULAR_NUM_DENOM] + +# cover possible time signatures +REGULAR_TICKS_PER_BEAT = [48, 96, 192, 384, 120, 240, 480, 960, 256, 512, 1024] diff --git a/data_representation/encoding_utils.py b/data_representation/encoding_utils.py new file mode 100644 index 0000000..a4695cf --- /dev/null +++ b/data_representation/encoding_utils.py @@ -0,0 +1,879 @@ +from typing import Any +from fractions import Fraction +from collections import defaultdict + +from miditoolkit import TimeSignature + +from constants import * + +''' +This script contains specific encoding functions for different encoding schemes. +''' + +def frange(start, stop, step): + while start < stop: + yield start + start += step + +################################# for REMI style encoding ################################# + +class Corpus2event_remi(): + def __init__(self, num_features:int): + self.num_features = num_features + + def _create_event(self, name, value): + event = dict() + event['name'] = name + event['value'] = value + return event + def _break_down_numerator(self, numerator, possible_time_signatures): + """Break down a numerator into smaller time signatures. + + Args: + numerator: Target numerator to decompose (must be > 0). + possible_time_signatures: List of (numerator, denominator) tuples, + sorted in descending order (e.g., [(4,4), (3,4)]). + + Returns: + List of decomposed time signatures (e.g., [(4,4), (3,4)]). + + Raises: + ValueError: If decomposition is impossible. 
+ """ + if numerator <= 0: + raise ValueError("Numerator must be positive.") + if not possible_time_signatures: + raise ValueError("No possible time signatures provided.") + + result = [] + original_numerator = numerator # For error message + + # Sort signatures in descending order to prioritize larger chunks + possible_time_signatures = sorted(possible_time_signatures, key=lambda x: -x[0]) + + while numerator > 0: + subtracted = False # Track if any subtraction occurred in this iteration + + for sig in possible_time_signatures: + sig_numerator, _ = sig + if sig_numerator <= 0: + continue # Skip invalid signatures + + while numerator >= sig_numerator: + result.append(sig) + numerator -= sig_numerator + subtracted = True + + # If no progress was made, decomposition failed + if not subtracted: + raise ValueError( + f"Cannot decompose numerator {original_numerator} " + f"with given time signatures {possible_time_signatures}. " + f"Remaining: {numerator}" + ) + + return result + def _normalize_time_signature(self, time_signature, ticks_per_beat, next_change_point): + """ + Normalize irregular time signatures to standard ones by breaking them down + into common time signatures, and adjusting their durations to fit the given + musical structure. + + Parameters: + - time_signature: TimeSignature object with numerator, denominator, and start time. + - ticks_per_beat: Number of ticks per beat, representing the resolution of the timing. + - next_change_point: Tick position where the next time signature change occurs. + + Returns: + - A list of TimeSignature objects, normalized to fit within regular time signatures. + + Procedure: + 1. If the time signature is already a standard one (in REGULAR_NUM_DENOM), return it. + 2. For non-standard signatures, break them down into simpler, well-known signatures. + - For unusual denominations (e.g., 16th, 32nd, or 64th notes), normalize to 4/4. + - For 6/4 signatures, break it into two 3/4 measures. + 3. If the time signature has a non-standard numerator and denominator, break it down + into the largest possible numerators that still fit within the denominator. + This ensures that the final measure fits within the regular time signature format. + 4. Calculate the resolution (duration in ticks) for each bar and ensure the bars + fit within the time until the next change point. + - Adjust the number of bars if they exceed the available space. + - If the total length is too short, repeat the first (largest) bar to fill the gap. + 5. Convert the breakdown into TimeSignature objects and return the normalized result. 
+ """ + + # Check if the time signature is a regular one, return it if so + if (time_signature.numerator, time_signature.denominator) in REGULAR_NUM_DENOM: + return [time_signature] + + # Extract time signature components + numerator, denominator, bar_start_tick = time_signature.numerator, time_signature.denominator, time_signature.time + + # Normalize time signatures with 16th, 32nd, or 64th note denominators to 4/4 + if denominator in [16, 32, 64]: + return [TimeSignature(4, 4, time_signature.time)] + + # Special case for 6/4, break it into two 3/4 bars + elif denominator == 6 and numerator == 4: + return [TimeSignature(3, 4, time_signature.time), TimeSignature(3, 4, time_signature.time)] + + # Determine possible regular signatures for the given denominator + possible_time_signatures = [sig for sig in CORE_NUM_DENOM if sig[1] == denominator] + + # Sort by numerator in descending order to prioritize larger numerators + possible_time_signatures.sort(key=lambda x: x[0], reverse=True) + + result = [] + + # Break down the numerator into smaller regular numerators + max_iterations = 100 # Prevent infinite loops + original_numerator = numerator # Store original for error message + + # Break down the numerator into smaller regular numerators + iteration_count = 0 + while numerator > 0: + iteration_count += 1 + if iteration_count > max_iterations: + raise ValueError( + f"Failed to normalize time signature {original_numerator}/{denominator}. " + f"Could not break down numerator {original_numerator} with available signatures: " + f"{possible_time_signatures}" + ) + + for sig in possible_time_signatures: + # Subtract numerators and add to the result + while numerator >= sig[0]: + result.append(sig) + numerator -= sig[0] + + + + # Calculate the resolution (length in ticks) of each bar + bar_resol_list = [int(ticks_per_beat * numerator * (4 / denominator)) for numerator, denominator in result] + + # Adjust bars to fit within the remaining ticks before the next change point + total_length = 0 + for idx, bar_resol in enumerate(bar_resol_list): + total_length += bar_resol + if total_length > next_change_point - bar_start_tick: + result = result[:idx+1] + break + + # If the total length is too short, repeat the first (largest) bar until the gap is filled + while total_length < next_change_point - bar_start_tick: + result.append(result[0]) + total_length += int(ticks_per_beat * result[0][0] * (4 / result[0][1])) + + # Recalculate bar resolutions for the final result + bar_resol_list = [int(ticks_per_beat * numerator * (4 / denominator)) for numerator, denominator in result] + + # Insert a starting resolution of 0 and calculate absolute tick positions for each TimeSignature + bar_resol_list.insert(0, 0) + total_length = bar_start_tick + normalized_result = [] + for sig, length in zip(result, bar_resol_list): + total_length += length + normalized_result.append(TimeSignature(sig[0], sig[1], total_length)) + + return normalized_result + + def _process_time_signature(self, time_signature_changes, ticks_per_beat, first_note_tick, global_end): + """ + Process and normalize time signature changes for a given musical piece. + + Parameters: + - time_signature_changes: A list of TimeSignature objects representing time signature changes in the music. + - ticks_per_beat: The resolution of timing in ticks per beat. + - first_note_tick: The tick position of the first note in the piece. + - global_end: The tick position where the piece ends. + + Returns: + - A list of processed and normalized time signature changes. 
If no valid time signature + changes are found, returns None. + + Procedure: + 1. Check the validity of the time signature changes: + - Ensure there is at least one time signature change. + - Ensure the first time signature change occurs at the beginning (before the first note). + 2. Remove duplicate consecutive time signatures: + - Only add time signatures that differ from the previous one (de-duplication). + 3. Normalize the time signatures: + - For each time signature, determine its duration by calculating the time until the + next change point or the end of the piece. + - Use the _normalize_time_signature method to break down non-standard signatures into + simpler, well-known signatures that fit within the musical structure. + 4. Return the processed and normalized time signature changes. + + """ + + # Check if there are any time signature changes + if len(time_signature_changes) == 0: + print("No time signature change in this tune, default to 4/4 time signature") + # default to 4/4 time signature if none are found + return [TimeSignature(4, 4, 0)] + + # Ensure the first time signature change is at the start of the piece (before the first note) + if time_signature_changes[0].time != 0 and time_signature_changes[0].time > first_note_tick: + print("The first time signature change is not at the beginning of the tune") + return None + + # Remove consecutive duplicate time signatures (de-duplication) + processed_time_signature_changes = [] + for idx, time_sig in enumerate(time_signature_changes): + if idx == 0: + processed_time_signature_changes.append(time_sig) + else: + prev_time_sig = time_signature_changes[idx-1] + # Only add time signature if it's different from the previous one + if not (prev_time_sig.numerator == time_sig.numerator and prev_time_sig.denominator == time_sig.denominator): + processed_time_signature_changes.append(time_sig) + + # Normalize the time signatures to standard formats + normalized_time_signature_changes = [] + for idx, time_signature in enumerate(processed_time_signature_changes): + if idx == len(time_signature_changes) - 1: + # If it's the last time signature change, set the next change point as the end of the piece + next_change_point = global_end + else: + # Otherwise, set the next change point as the next time signature's start time + next_change_point = time_signature_changes[idx+1].time + + # Normalize the current time signature and extend the result + normalized_time_signature_changes.extend(self._normalize_time_signature(time_signature, ticks_per_beat, next_change_point)) + + # Return the list of processed and normalized time signatures + time_signature_changes = normalized_time_signature_changes + return time_signature_changes + + def _half_step_interval_gap_check_across_instruments(self, instrument_note_dict): + ''' + This function checks for half-step interval gaps between notes across different instruments. + It will avoid half-step intervals by keeping one note from any pair of notes that are a half-step apart, + regardless of which instrument they belong to. 
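+
+        Illustrative example (not part of the original docstring): given
+        instrument_note_dict = {0: {60: [(4, 80)]}, 40: {61: [(8, 80)]}}
+        (program 0 holds pitch 60 for 4 steps, program 40 holds pitch 61 for 8 steps),
+        pitches 61 and 60 form a half-step pair; the longer pitch 61 is kept, so the
+        pruned result is {0: {}, 40: {61: [(8, 80)]}}.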
+ ''' + # order instrument_note_dict by pitch in descending order + instrument_note_dict = dict(sorted(instrument_note_dict.items())) + + # Create a dictionary to store all pitches across instruments + all_pitches = {} + + # Collect all pitches from each instrument and sort them in descending order + for instrument, notes in instrument_note_dict.items(): + for pitch, durations in notes.items(): + all_pitches[pitch] = all_pitches.get(pitch, []) + [(instrument, durations)] + + # Sort the pitches in descending order + sorted_pitches = sorted(all_pitches.keys(), reverse=True) + + # Create a new list to store the final pitches after comparison + final_pitch_list = [] + + # Use an index pointer to control the sliding window + idx = 0 + while idx < len(sorted_pitches) - 1: + current_pitch = sorted_pitches[idx] + next_pitch = sorted_pitches[idx + 1] + + if current_pitch - next_pitch == 1: # Check for a half-step interval gap + current_max_duration = max(duration for _, durations in all_pitches[current_pitch] for duration, _ in durations) + next_max_duration = max(duration for _, durations in all_pitches[next_pitch] for duration, _ in durations) + + if current_max_duration < next_max_duration: + # Keep the higher pitch (next_pitch) and skip the current_pitch + final_pitch_list.append(next_pitch) + else: + # Keep the lower pitch (current_pitch) and skip the next_pitch + final_pitch_list.append(current_pitch) + + # Skip the next pitch because we already handled it + idx += 2 + else: + # No half-step gap, keep the current pitch and move to the next one + final_pitch_list.append(current_pitch) + idx += 1 + + # Ensure the last pitch is added if it's not part of a half-step interval + if idx == len(sorted_pitches) - 1: + final_pitch_list.append(sorted_pitches[-1]) + + # Filter out notes not in the final pitch list and update the instrument_note_dict + for instrument in instrument_note_dict.keys(): + instrument_note_dict[instrument] = { + pitch: instrument_note_dict[instrument][pitch] + for pitch in sorted(instrument_note_dict[instrument].keys(), reverse=True) if pitch in final_pitch_list + } + + return instrument_note_dict + + def __call__(self, song_data, in_beat_resolution): + ''' + Process a song's data to generate a sequence of musical events, including bars, chords, tempo, + and notes, similar to the approach used in the CP paper (corpus2event_remi_v2). + + Parameters: + - song_data: A dictionary containing metadata, notes, chords, and tempos of the song. + - in_beat_resolution: The resolution of timing in beats (how many divisions per beat). + + Returns: + - A sequence of musical events including start (SOS), bars, chords, tempo, instruments, notes, + and an end (EOS) event. If the time signature is invalid, returns None. + + Procedure: + 1. **Global Setup**: + - Extract global metadata like first and last note ticks, time signature changes, and ticks + per beat. + - Compute `in_beat_tick_resol`, the ratio of ticks per beat to the input beat resolution, + to assist in dividing bars later. + - Get a sorted list of instruments in the song. + + 2. **Time Signature Processing**: + - Call `_process_time_signature` to clean up and normalize the time signatures in the song. + - If the time signatures are invalid (e.g., no time signature changes or missing at the + start), the function exits early with None. + + 3. **Sequence Generation**: + - Initialize the sequence with a start token (SOS) and prepare variables for tracking + previous chord, tempo, and instrument states. 
+ - Loop through each time signature change, dividing the song into measures based on the + current time signature's numerator and denominator. + - For each measure, append "Bar" tokens to mark measure boundaries, while ensuring that no + more than four consecutive empty bars are added. + - For each step within a measure, process the following: + - **Chords**: If there is a chord change, add a corresponding chord event. + - **Tempo**: If the tempo changes, add a tempo event. + - **Notes**: Iterate over each instrument, adding notes and checking for half-step + intervals, deduplicating notes, and choosing the longest duration for each pitch. + - Append a "Beat" event for each step with musical events. + + 4. **End Sequence**: + - Conclude the sequence by appending a final "Bar" token followed by an end token (EOS). + ''' + + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # Starting tick of the first note + global_end = song_data['metadata']['last_note'] # Ending tick of the last note + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat resolution + # Resolution for dividing beats within measures, expressed as a fraction + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Example: 1024/12 -> (256, 3) + instrument_list = sorted(list(song_data['notes'].keys())) # Get a sorted list of instruments in the song + + # --- process time signature --- # + # Normalize and process the time signatures in the song + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if time signature is invalid + + # --- create sequence --- # + prev_instr_idx = None # Track the previously processed instrument + final_sequence = [] + final_sequence.append(self._create_event('SOS', None)) # Add Start of Sequence (SOS) token + prev_chord = None # Track the previous chord + prev_tempo = None # Track the previous tempo + chord_value = None + tempo_value = None + + # Process each time signature change + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to indicate a time signature change + # Calculate bar resolution based on the current time signature + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format time signature name + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate bar resolution in ticks + bar_start_tick = time_signature_changes[idx].time # Start tick of the current bar + # Determine the next time signature change point or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx+1].time + + # Process each measure within the current time signature + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + empty_bar_token = self._create_event('Bar', None) # Token for empty bars + + # Ensure no more than 4 consecutive empty bars are added + if len(final_sequence) >= 4: + if not (final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and + final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token): + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) 
# Mark new bar with time signature + else: + final_sequence.append(self._create_event('Bar', None)) + else: + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) + else: + if time_sig_change_flag: + final_sequence.append(self._create_event('Bar', time_sig_name)) + else: + final_sequence.append(self._create_event('Bar', None)) + + time_sig_change_flag = False # Reset time signature change flag + + # Process events within each beat + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + events_list = [] + # Retrieve chords and tempos at the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = song_data['tempos'].get(beat_step) + + # Process chord and tempo if the number of features allows for it + if self.num_features in {8, 7}: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') # Extract chord info + chord_value = root + '_' + quality + if t_tempos is not None: + tempo_value = t_tempos[-1].tempo # Extract tempo value + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # Process notes for each instrument at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If there are notes at this beat step, process them. + if t_notes is not None: + # Track notes to avoid duplicates and check for half-step intervals + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if prev_chord != chord_value: + events_list.append(self._create_event('Chord', chord_value)) + prev_chord = chord_value + if prev_tempo != tempo_value: + events_list.append(self._create_event('Tempo', tempo_value)) + prev_tempo = tempo_value + + # add instrument and note + for instrument in pruned_instrument_note_dict: + if self.num_features in {5, 8}: + events_list.append(self._create_event('Instrument', instrument)) + + for pitch in pruned_instrument_note_dict[instrument]: + max_duration = max(pruned_instrument_note_dict[instrument][pitch], key=lambda x: x[0]) + note_event = [ + self._create_event('Note_Pitch', pitch), + self._create_event('Note_Duration', max_duration[0]) + ] + if self.num_features in {7, 8}: + note_event.append(self._create_event('Note_Velocity', max_duration[1])) + events_list.extend(note_event) + + # If there are events in this step, add a "Beat" event and the collected events + if len(events_list): + final_sequence.append(self._create_event('Beat', in_beat_off_idx)) + final_sequence.extend(events_list) + + # --- end with BAR & EOS --- # + final_sequence.append(self._create_event('Bar', None)) # Add final bar token + final_sequence.append(self._create_event('EOS', None)) # Add End of Sequence (EOS) token + return final_sequence + +################################# for CP style encoding ################################# + +class Corpus2event_cp(Corpus2event_remi): + def __init__(self, num_features): + 
super().__init__(num_features) + self.num_features = num_features + self._init_event_template() + + def _init_event_template(self): + ''' + The order of musical features is Type, Beat, Chord, Tempo, Instrument, Pitch, Duration, Velocity + ''' + self.event_template = {} + if self.num_features == 8: + feature_names = ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'] + elif self.num_features == 7: + feature_names = ['type', 'beat', 'chord', 'tempo', 'pitch', 'duration', 'velocity'] + elif self.num_features == 5: + feature_names = ['type', 'beat', 'instrument', 'pitch', 'duration'] + elif self.num_features == 4: + feature_names = ['type', 'beat', 'pitch', 'duration'] + for feature_name in feature_names: + self.event_template[feature_name] = 0 + + def create_cp_sos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'SOS' + return total_event + + def create_cp_eos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'EOS' + return total_event + + def create_cp_metrical_event(self, pos, chord, tempo): + ''' + when the compound token is related to metrical information + ''' + meter_event = self.event_template.copy() + meter_event['type'] = 'Metrical' + meter_event['beat'] = pos + if self.num_features == 7 or self.num_features == 8: + meter_event['chord'] = chord + meter_event['tempo'] = tempo + return meter_event + + def create_cp_note_event(self, instrument_name, pitch, duration, velocity): + ''' + when the compound token is related to note information + ''' + note_event = self.event_template.copy() + note_event['type'] = 'Note' + note_event['pitch'] = pitch + note_event['duration'] = duration + if self.num_features == 5 or self.num_features == 8: + note_event['instrument'] = instrument_name + if self.num_features == 7 or self.num_features == 8: + note_event['velocity'] = velocity + return note_event + + def create_cp_bar_event(self, time_sig_change_flag=False, time_sig_name=None): + meter_event = self.event_template.copy() + if time_sig_change_flag: + meter_event['type'] = 'Metrical' + meter_event['beat'] = f'Bar_{time_sig_name}' + else: + meter_event['type'] = 'Metrical' + meter_event['beat'] = 'Bar' + return meter_event + + def __call__(self, song_data, in_beat_resolution): + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # First note timestamp in ticks + global_end = song_data['metadata']['last_note'] # Last note timestamp in ticks + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes throughout the song + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat (resolution of the timing grid) + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Tick resolution for beats + instrument_list = sorted(list(song_data['notes'].keys())) # List of instruments in the song + + # --- process time signature --- # + # Process time signature changes and adjust them for the given song structure + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if no valid time signature changes found + + # --- create sequence --- # + final_sequence = [] # Initialize the final sequence to store the events + final_sequence.append(self.create_cp_sos_event()) # Add the Start-of-Sequence (SOS) event + chord_text = None # Placeholder for the current chord + tempo_text = None # Placeholder for the current tempo 
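+
+        # Illustrative note (added commentary, not original code): with num_features=8 the
+        # compound-token template holds the fields
+        #   ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'],
+        # so a metrical token produced below might look like
+        #   {'type': 'Metrical', 'beat': 'Beat_3', 'chord': 'Chord_C_M', 'tempo': 'Tempo_120',
+        #    'instrument': 0, 'pitch': 0, 'duration': 0, 'velocity': 0}
+        # and a note token like
+        #   {'type': 'Note', 'beat': 0, 'chord': 0, 'tempo': 0, 'instrument': 'Instrument_0',
+        #    'pitch': 'Note_Pitch_60', 'duration': 'Note_Duration_4', 'velocity': 'Note_Velocity_80'};
+        # unused slots keep the template default of 0.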
+ + # Loop through each time signature change and process the corresponding measures + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to track when time signature changes + # Calculate bar resolution (number of ticks per bar based on the time signature) + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format the time signature as a string + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate number of ticks per bar + bar_start_tick = time_signature_changes[idx].time # Starting tick for this time signature + + # Determine the point for the next time signature change or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx + 1].time + + # Iterate over each measure (bar) between the current and next time signature change + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + empty_bar_token = self.create_cp_bar_event() # Create an empty bar event + + # Check if the last four events in the sequence are consecutive empty bars + if len(final_sequence) >= 4: + if not (final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token): + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + else: + if time_sig_change_flag: + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + else: + final_sequence.append(self.create_cp_bar_event(time_sig_change_flag, time_sig_name)) + + # Reset the time signature change flag after handling the bar event + time_sig_change_flag = False + + # Loop through beats in each measure based on the in-beat resolution + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + chord_tempo_flag = False # Flag to track if chord and tempo events are added + events_list = [] # List to hold events for the current beat + pos_text = 'Beat_' + str(in_beat_off_idx) # Create a beat event label + + # --- chord & tempo processing --- # + # Unpack chords and tempos for the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = song_data['tempos'].get(beat_step) + + # If a chord is present, extract its root, quality, and bass + if self.num_features in {7, 8}: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') + chord_text = 'Chord_' + root + '_' + quality + + # If a tempo is present, format it as a string + if t_tempos is not None: + tempo_text = 'Tempo_' + str(t_tempos[-1].tempo) + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # --- instrument & note processing --- # + # Loop through each instrument and process its notes at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If notes are present, process them + if t_notes != None: + # Track notes and their properties (duration and velocity) for the current instrument + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + 
instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if not chord_tempo_flag: + if chord_text == None: + chord_text = 'Chord_N_N' + if tempo_text == None: + tempo_text = 'Tempo_N_N' + chord_tempo_flag = True + + events_list.append(self.create_cp_metrical_event(pos_text, chord_text, tempo_text)) + + # add instrument and note + for instrument_idx in pruned_instrument_note_dict: + instrument_name = 'Instrument_' + str(instrument_idx) + for pitch in pruned_instrument_note_dict[instrument_idx]: + max_duration = max(pruned_instrument_note_dict[instrument_idx][pitch], key=lambda x: x[0]) + note_pitch_text = 'Note_Pitch_' + str(pitch) + note_duration_text = 'Note_Duration_' + str(max_duration[0]) + note_velocity_text = 'Note_Velocity_' + str(max_duration[1]) + events_list.append(self.create_cp_note_event(instrument_name, note_pitch_text, note_duration_text, note_velocity_text)) + + # If there are any events for this beat, add them to the final sequence + if len(events_list) > 0: + final_sequence.extend(events_list) + + # --- end with BAR & EOS --- # + final_sequence.append(self.create_cp_bar_event()) # Add the final bar event + final_sequence.append(self.create_cp_eos_event()) # Add the End-of-Sequence (EOS) event + return final_sequence # Return the final sequence of events + +################################# for NB style encoding ################################# + +class Corpus2event_nb(Corpus2event_cp): + def __init__(self, num_features): + ''' + For convenience in logging, we use "type" word for "metric" sub-token in the code to compare easily with other encoding schemes + ''' + super().__init__(num_features) + self.num_features = num_features + self._init_event_template() + + def _init_event_template(self): + self.event_template = {} + if self.num_features == 8: + feature_names = ['type', 'beat', 'chord', 'tempo', 'instrument', 'pitch', 'duration', 'velocity'] + elif self.num_features == 7: + feature_names = ['type', 'beat', 'chord', 'tempo', 'pitch', 'duration', 'velocity'] + elif self.num_features == 5: + feature_names = ['type', 'beat', 'instrument', 'pitch', 'duration'] + elif self.num_features == 4: + feature_names = ['type', 'beat', 'pitch', 'duration'] + for feature_name in feature_names: + self.event_template[feature_name] = 0 + + def create_nb_sos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'SOS' + return total_event + + def create_nb_eos_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'EOS' + return total_event + + def create_nb_event(self, bar_beat_type, pos, chord, tempo, instrument_name, pitch, duration, velocity): + total_event = self.event_template.copy() + total_event['type'] = bar_beat_type + total_event['beat'] = pos + total_event['pitch'] = pitch + total_event['duration'] = duration + if self.num_features in {5, 8}: + total_event['instrument'] = instrument_name + if self.num_features in {7, 8}: + total_event['chord'] = chord + total_event['tempo'] = tempo + total_event['velocity'] = velocity + return total_event + + def create_nb_empty_bar_event(self): + total_event = self.event_template.copy() + total_event['type'] = 'Empty_Bar' + return total_event + + def 
get_bar_beat_idx(self, bar_flag, beat_flag, time_sig_name, time_sig_change_flag): + ''' + This function is to get the metric information for the current bar and beat + There are four types of metric information: NNN, SNN, SSN, SSS + Each letter represents the change of time signature, bar, and beat (new or same) + ''' + if time_sig_change_flag: # new time signature + return "NNN_" + time_sig_name + else: + if bar_flag and beat_flag: # same time sig & new bar & new beat + return "SNN" + elif not bar_flag and beat_flag: # same time sig & same bar & new beat + return "SSN" + elif not bar_flag and not beat_flag: # same time sig & same bar & same beat + return "SSS" + + def __call__(self, song_data, in_beat_resolution:int): + # --- global tag --- # + first_note_tick = song_data['metadata']['first_note'] # First note timestamp in ticks + global_end = song_data['metadata']['last_note'] # Last note timestamp in ticks + time_signature_changes = song_data['metadata']['time_signature'] # Time signature changes throughout the song + ticks_per_beat = song_data['metadata']['ticks_per_beat'] # Ticks per beat (resolution of the timing grid) + in_beat_tick_resol = Fraction(ticks_per_beat, in_beat_resolution) # Tick resolution for beats + instrument_list = sorted(list(song_data['notes'].keys())) # List of instruments in the song + + # --- process time signature --- # + # Process time signature changes and adjust them for the given song structure + time_signature_changes = self._process_time_signature(time_signature_changes, ticks_per_beat, first_note_tick, global_end) + if time_signature_changes == None: + return None # Exit if no valid time signature changes found + + # --- create sequence --- # + final_sequence = [] # Initialize the final sequence to store the events + final_sequence.append(self.create_nb_sos_event()) # Add the Start-of-Sequence (SOS) event + chord_text = None # Placeholder for the current chord + tempo_text = None # Placeholder for the current tempo + + # Loop through each time signature change and process the corresponding measures + for idx in range(len(time_signature_changes)): + time_sig_change_flag = True # Flag to track when time signature changes + # Calculate bar resolution (number of ticks per bar based on the time signature) + numerator = time_signature_changes[idx].numerator + denominator = time_signature_changes[idx].denominator + time_sig_name = f'time_signature_{numerator}/{denominator}' # Format the time signature as a string + bar_resol = int(ticks_per_beat * numerator * (4 / denominator)) # Calculate number of ticks per bar + bar_start_tick = time_signature_changes[idx].time # Starting tick for this time signature + + # Determine the point for the next time signature change or the end of the song + if idx == len(time_signature_changes) - 1: + next_change_point = global_end + else: + next_change_point = time_signature_changes[idx + 1].time + + # Iterate over each measure (bar) between the current and next time signature change + for measure_step in frange(bar_start_tick, next_change_point, bar_resol): + bar_flag = True + note_flag = False + + # Loop through beats in each measure based on the in-beat resolution + for in_beat_off_idx, beat_step in enumerate(frange(measure_step, measure_step + bar_resol, in_beat_tick_resol)): + beat_flag = True + events_list = [] + pos_text = 'Beat_' + str(in_beat_off_idx) + + # --- chord & tempo processing --- # + # Unpack chords and tempos for the current beat step + t_chords = song_data['chords'].get(beat_step) + t_tempos = 
song_data['tempos'].get(beat_step) + + # If a chord is present, extract its root, quality, and bass + if self.num_features == 8 or self.num_features == 7: + if t_chords is not None: + root, quality, _ = t_chords[-1].text.split('_') + chord_text = 'Chord_' + root + '_' + quality + + # If a tempo is present, format it as a string + if t_tempos is not None: + tempo_text = 'Tempo_' + str(t_tempos[-1].tempo) + + # Dictionary to track notes for each instrument to avoid duplicates + instrument_note_dict = defaultdict(dict) + + # --- instrument & note processing --- # + # Loop through each instrument and process its notes at the current beat step + for instrument_idx in instrument_list: + t_notes = song_data['notes'][instrument_idx].get(beat_step) + + # If notes are present, process them + if t_notes != None: + note_flag = True + + # Track notes and their properties (duration and velocity) for the current instrument + for note in t_notes: + if note.pitch not in instrument_note_dict[instrument_idx]: + instrument_note_dict[instrument_idx][note.pitch] = [(note.quantized_duration, note.velocity)] + else: + instrument_note_dict[instrument_idx][note.pitch].append((note.quantized_duration, note.velocity)) + + # # Check for half-step interval gaps and handle them accordingly + # self._half_step_interval_gap_check(instrument_note_dict, instrument_idx) + + if len(instrument_note_dict) == 0: + continue + + # Check for half-step interval gaps and handle them across instruments + pruned_instrument_note_dict = self._half_step_interval_gap_check_across_instruments(instrument_note_dict) + + # add chord and tempo + if self.num_features in {7, 8}: + if chord_text == None: + chord_text = 'Chord_N_N' + if tempo_text == None: + tempo_text = 'Tempo_N_N' + + # add instrument and note + for instrument_idx in pruned_instrument_note_dict: + instrument_name = 'Instrument_' + str(instrument_idx) + for pitch in pruned_instrument_note_dict[instrument_idx]: + max_duration = max(pruned_instrument_note_dict[instrument_idx][pitch], key=lambda x: x[0]) + note_pitch_text = 'Note_Pitch_' + str(pitch) + note_duration_text = 'Note_Duration_' + str(max_duration[0]) + note_velocity_text = 'Note_Velocity_' + str(max_duration[1]) + bar_beat_type = self.get_bar_beat_idx(bar_flag, beat_flag, time_sig_name, time_sig_change_flag) + events_list.append(self.create_nb_event(bar_beat_type, pos_text, chord_text, tempo_text, instrument_name, note_pitch_text, note_duration_text, note_velocity_text)) + bar_flag = False + beat_flag = False + time_sig_change_flag = False + + # If there are any events for this beat, add them to the final sequence + if events_list != None and len(events_list): + final_sequence.extend(events_list) + + # when there is no note in this bar + if not note_flag: + # avoid consecutive empty bars (more than 4 is not allowed) + empty_bar_token = self.create_nb_empty_bar_event() + if len(final_sequence) >= 4: + if final_sequence[-1] == empty_bar_token and final_sequence[-2] == empty_bar_token and final_sequence[-3] == empty_bar_token and final_sequence[-4] == empty_bar_token: + continue + final_sequence.append(empty_bar_token) + + # --- end with BAR & EOS --- # + final_sequence.append(self.create_nb_eos_event()) + return final_sequence \ No newline at end of file diff --git a/data_representation/step1_midi2corpus.py b/data_representation/step1_midi2corpus.py new file mode 100644 index 0000000..815fc8f --- /dev/null +++ b/data_representation/step1_midi2corpus.py @@ -0,0 +1,650 @@ +import argparse +import time +import itertools 
+import copy +from copy import deepcopy +from pathlib import Path +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction +from typing import List +import os +from muspy import sort +import numpy as np +import pickle +from tqdm import tqdm + +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument +from chorder import Dechorder + +from constants import NUM2PITCH, PROGRAM_INSTRUMENT_MAP, INSTRUMENT_PROGRAM_MAP + +''' +This script is designed to preprocess MIDI files and convert them into a structured corpus suitable for symbolic music analysis or model training. +It handles various tasks, including setting beat resolution, calculating duration, velocity, and tempo bins, and processing MIDI data into quantized musical events. +''' + +def get_tempo_bin(max_tempo:int, ratio:float=1.1): + bpm = 30 + regular_tempo_bins = [bpm] + while bpm < max_tempo: + bpm *= ratio + bpm = round(bpm) + if bpm > max_tempo: + break + regular_tempo_bins.append(bpm) + return np.array(regular_tempo_bins) + +def split_markers(markers:List[miditoolkit.midi.containers.Marker]): + ''' + split markers into chord, tempo, label + ''' + chords = [] + for marker in markers: + splitted_text = marker.text.split('_') + if splitted_text[0] != 'global' and 'Boundary' not in splitted_text[0]: + chords.append(marker) + return chords + +class CorpusMaker(): + def __init__( + self, + dataset_name:str, + num_features:int, + in_dir:Path, + out_dir:Path, + debug:bool + ): + ''' + Initialize the CorpusMaker with dataset information and directory paths. + It sets up MIDI paths, output directories, and debug mode, then + retrieves the beat resolution, duration bins, velocity/tempo bins, and prepares the MIDI file list. + ''' + self.dataset_name = dataset_name + self.num_features = num_features + self.midi_path = in_dir / f"{dataset_name}" + self.out_dir = out_dir + self.debug = debug + self._get_in_beat_resolution() + self._get_duration_bins() + self._get_velocity_tempo_bins() + self._get_min_max_last_time() + self._prepare_midi_list() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def _get_duration_bins(self): + # Set up regular duration bins for quantizing note lengths, based on the beat resolution. + base_duration = {4:[1,2,3,4,5,6,8,10,12,16,20,24,28,32], + 8:[1,2,3,4,6,8,10,12,14,16,20,24,28,32,36,40,48,56,64], + 12:[1,2,3,4,6,9,12,15,18,24,30,36,42,48,54,60,72,84,96]} + base_duration_list = base_duration[self.in_beat_resolution] + self.regular_duration_bins = np.array(base_duration_list) + + def _get_velocity_tempo_bins(self): + # Define velocity and tempo bins based on whether the dataset is a performance or score type. + midi_type_dict = {'BachChorale': 'score', 'Pop1k7': 'perform', 'Pop909': 'score', 'SOD': 'score', 'LakhClean': 'score', 'Symphony': 'score'} + try: + midi_type = midi_type_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. 
use the setting of LakhClean") + midi_type = midi_type_dict['LakhClean'] + # For performance-type datasets, set finer granularity of velocity and tempo bins. + if midi_type == 'perform': + self.regular_velocity_bins = np.array(list(range(40, 128, 8)) + [127]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=240, ratio=1.04) + # For score-type datasets, use coarser velocity and tempo bins. + elif midi_type == 'score': + self.regular_velocity_bins = np.array([40, 60, 80, 100, 120]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=390, ratio=1.04) + + def _get_min_max_last_time(self): + ''' + Set the minimum and maximum allowed length of a MIDI track, depending on the dataset. + 0 to 2000 means no limitation + ''' + # last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (60, 600), 'Symphony': (60, 1500)} + last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (0, 2000), 'Symphony': (60, 1500)} + try: + self.min_last_time, self.max_last_time = last_time_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.min_last_time, self.max_last_time = last_time_dict['LakhClean'] + + def _prepare_midi_list(self): + midi_path = Path(self.midi_path) + # detect subdirectories and get all midi files + if not midi_path.exists(): + raise ValueError(f"midi_path {midi_path} does not exist") + # go though all subdirectories and get all midi files + midi_files = [] + for root, _, files in os.walk(midi_path): + for file in files: + if file.endswith('.mid'): + # print(Path(root) / file) + midi_files.append(Path(root) / file) + self.midi_list = midi_files + print(f"Found {len(self.midi_list)} MIDI files in {midi_path}") + + def make_corpus(self) -> None: + ''' + Main method to process the MIDI files and create the corpus data. + It supports both single-processing (debug mode) and multi-processing for large datasets. + ''' + print("preprocessing midi data to corpus data") + # check the corpus folder is already exist and make it if not + Path(self.out_dir).mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"corpus_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"midi_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + start_time = time.time() + if self.debug: + # single processing for debugging + broken_counter = 0 + success_counter = 0 + for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + message = self._mp_midi2corpus(file_path) + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + else: + # Multi-threaded processing for faster corpus generation. 
+ broken_counter = 0 + success_counter = 0 + # filter out processed files + print(self.out_dir) + processed_files = list(Path(self.out_dir).glob(f"midi_{self.dataset_name}/*.mid")) + processed_files = [x.name for x in processed_files] + print(f"processed files: {len(processed_files)}") + print("length of midi list: ", len(self.midi_list)) + # Use set for faster lookup (O(1) per check) + processed_files_set = set(processed_files) + self.midi_list = [x for x in self.midi_list if x.name not in processed_files_set] + # reverse the list to process the latest files first + self.midi_list.reverse() + print(f"length of midi list after filtering: ", len(self.midi_list)) + with Pool(16) as p: + for message in tqdm(p.imap(self._mp_midi2corpus, self.midi_list, 1000), total=len(self.midi_list)): + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + # for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + # message = self._mp_midi2corpus(file_path) + # if message == "error": + # broken_counter += 1 + # elif message == "success": + # success_counter += 1 + print(f"Making corpus takes: {time.time() - start_time}s, success: {success_counter}, broken: {broken_counter}") + + def _mp_midi2corpus(self, file_path: Path): + """Convert MIDI to corpus format and save both corpus (.pkl) and MIDI (.mid).""" + try: + midi_obj = self._analyze(file_path) + corpus, midi_obj = self._midi2corpus(midi_obj) + # --- 1. Save corpus (.pkl) --- + relative_path = file_path.relative_to(self.midi_path) # Get relative path from input dir + safe_name = str(relative_path).replace("/", "_").replace("\\", "_").replace(".mid", ".pkl") + save_path = Path(self.out_dir) / f"corpus_{self.dataset_name}" / safe_name + save_path.parent.mkdir(parents=True, exist_ok=True) # Ensure dir exists + with save_path.open("wb") as f: + pickle.dump(corpus, f) + + # --- 2. Save MIDI (.mid) --- + midi_save_dir = Path("../dataset/represented_data/corpus") / f"midi_{self.dataset_name}" + midi_save_dir.mkdir(parents=True, exist_ok=True) + midi_save_path = midi_save_dir / file_path.name # Keep original MIDI filename + midi_obj.dump(midi_save_path) + + del midi_obj, corpus + return "success" + + except (OSError, EOFError, ValueError, KeyError, AssertionError) as e: + print(f"Error processing {file_path.name}: {e}") + return "error" + except Exception as e: + print(f"Unexpected error in {file_path.name}: {e}") + return "error" + def _check_length(self, last_time:float): + if last_time < self.min_last_time: + raise ValueError(f"last time {last_time} is out of range") + + def _analyze(self, midi_path:Path): + # Loads and analyzes a MIDI file, performing various checks and extracting chords. 
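+        # Illustrative note (added commentary, not original code): the chords extracted
+        # further down are stored as miditoolkit Marker objects whose text has the form
+        # 'root_quality_bass', e.g. Marker(time=0, text='C_M_C') for a C major chord over a
+        # C bass, or 'N_N_N' when Dechorder finds no complete chord for that beat.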
+ midi_obj = miditoolkit.midi.parser.MidiFile(midi_path) + + # check length + mapping = midi_obj.get_tick_to_time_mapping() + last_time = mapping[midi_obj.max_tick] + self._check_length(last_time) + + for ins in midi_obj.instruments: + # delete instrument with no notes + if len(ins.notes) == 0: + midi_obj.instruments.remove(ins) + continue + notes = ins.notes + notes = sorted(notes, key=lambda x: (x.start, x.pitch)) + + # three steps to merge instruments + self._merge_percussion(midi_obj) + self._pruning_instrument(midi_obj) + self._limit_max_track(midi_obj) + + if self.num_features == 7 or self.num_features == 8: + # in case of 7 or 8 features, we need to extract chords + new_midi_obj = self._pruning_notes_for_chord_extraction(midi_obj) + chords = Dechorder.dechord(new_midi_obj) + markers = [] + for cidx, chord in enumerate(chords): + if chord.is_complete(): + chord_text = NUM2PITCH[chord.root_pc] + '_' + chord.quality + '_' + NUM2PITCH[chord.bass_pc] + else: + chord_text = 'N_N_N' + markers.append(Marker(time=int(cidx*new_midi_obj.ticks_per_beat), text=chord_text)) + + # de-duplication + prev_chord = None + dedup_chords = [] + for m in markers: + if m.text != prev_chord: + prev_chord = m.text + dedup_chords.append(m) + + # return midi + midi_obj.markers = dedup_chords + return midi_obj + + def _pruning_grouped_notes_from_quantization(self, instr_grid:dict): + ''' + In case where notes are grouped in the same quant_time but with different start time, unintentional chords are created + rule1: if notes have half step interval, delete the shorter one + rule2: if notes do not share 50% of duration of the shorter note, delete the shorter one + ''' + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + for time in time_list: + notes = instr_grid[instr][time] + if len(notes) == 1: + continue + else: + new_notes = [] + # sort in pitch with ascending order + notes.sort(key=lambda x: x.pitch) + for i in range(len(notes)-1): + # if start time is same add to new_notes + if notes[i].start == notes[i+1].start: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + continue + if notes[i].pitch == notes[i+1].pitch or notes[i].pitch + 1 == notes[i+1].pitch: + # select longer note + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + # check how much duration they share + shared_duration = min(notes[i].end, notes[i+1].end) - max(notes[i].start, notes[i+1].start) + shorter_duration = min(notes[i].end - notes[i].start, notes[i+1].end - notes[i+1].start) + # unless they share more than 80% of duration, select longer note (pruning shorter note) + if shared_duration / shorter_duration < 0.8: + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + if len(new_notes) == 0: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + else: + new_notes.append(notes[i+1]) + instr_grid[instr][time] = new_notes + + def _midi2corpus(self, midi_obj:miditoolkit.midi.parser.MidiFile): + # Checks if the ticks per beat in the MIDI file is lower than the expected resolution. + # If it is, raise an error. + if midi_obj.ticks_per_beat < self.in_beat_resolution: + raise ValueError(f'[x] Irregular ticks_per_beat. {midi_obj.ticks_per_beat}') + + # Ensure there is at least one time signature change in the MIDI file. 
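+
+        # Illustrative note (added commentary, not original code): the quantisation grid used
+        # later in this method works in fractions of a beat. For example, with
+        # ticks_per_beat=480 and in_beat_resolution=12, in_beat_tick_resol = Fraction(480, 12)
+        # = 40 ticks, so a note starting at tick 95 snaps to int(round(95 / 40)) * 40 = 80,
+        # and a 130-tick note gets relative_duration = round(130 / 40) = 3, which is then
+        # mapped onto the nearest value in regular_duration_bins.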
+ # if len(midi_obj.time_signature_changes) == 0: + # raise ValueError('[x] No time_signature_changes') + + # Ensure there are no duplicated time signature changes. + # time_list = [ts.time for ts in midi_obj.time_signature_changes] + # if len(time_list) != len(set(time_list)): + # raise ValueError('[x] Duplicated time_signature_changes') + + # If the dataset is 'LakhClean' or 'SymphonyMIDI', verify there are at least 4 tracks. + # if self.dataset_name == 'LakhClean' or self.dataset_name == 'SymphonyMIDI': + # if len(midi_obj.instruments) < 4: + # raise ValueError('[x] We will use more than 4 tracks in Lakh Clean dataset.') + + # Calculate the resolution of ticks per beat as a fraction. + in_beat_tick_resol = Fraction(midi_obj.ticks_per_beat, self.in_beat_resolution) + + # Extract the initial time signature (numerator and denominator) and calculate the number of ticks for the first bar. + if len(midi_obj.time_signature_changes) != 0: + initial_numerator = midi_obj.time_signature_changes[0].numerator + initial_denominator = midi_obj.time_signature_changes[0].denominator + else: + # If no time signature changes, set default values + initial_numerator = 4 + initial_denominator = 4 + first_bar_resol = int(midi_obj.ticks_per_beat * initial_numerator * (4 / initial_denominator)) + + # --- load notes --- # + instr_notes = self._make_instr_notes(midi_obj) + # --- load information --- # + # load chords, labels + chords = split_markers(midi_obj.markers) + chords.sort(key=lambda x: x.time) + + + # load tempos + tempos = midi_obj.tempo_changes if len(midi_obj.tempo_changes) > 0 else [] + if len(tempos) == 0: + # if no tempo changes, set the default tempo to 120 BPM + tempos = [miditoolkit.midi.containers.TempoChange(time=0, tempo=120)] + tempos.sort(key=lambda x: x.time) + + # --- process items to grid --- # + # compute empty bar offset at head + first_note_time = min([instr_notes[k][0].start for k in instr_notes.keys()]) + last_note_time = max([instr_notes[k][-1].start for k in instr_notes.keys()]) + + quant_time_first = int(round(first_note_time / in_beat_tick_resol)) * in_beat_tick_resol + offset = quant_time_first // first_bar_resol # empty bar + offset_by_resol = offset * first_bar_resol + # --- process notes --- # + instr_grid = dict() + for key in instr_notes.keys(): + notes = instr_notes[key] + note_grid = defaultdict(list) + for note in notes: + # skip notes out of range, below C-1 and above C8 + if note.pitch < 12 or note.pitch >= 120: + continue + + # in case when the first note starts at slightly before the first bar + note.start = note.start - offset_by_resol if note.start - offset_by_resol > 0 else 0 + note.end = note.end - offset_by_resol if note.end - offset_by_resol > 0 else 0 + + # relative duration + # skip note with 0 duration + note_duration = note.end - note.start + relative_duration = round(note_duration / in_beat_tick_resol) + if relative_duration == 0: + continue + if relative_duration > self.in_beat_resolution * 8: # 8 beats + relative_duration = self.in_beat_resolution * 8 + + # use regular duration bins + note.quantized_duration = self.regular_duration_bins[np.argmin(abs(self.regular_duration_bins-relative_duration))] + + # quantize start time + quant_time = int(round(note.start / in_beat_tick_resol)) * in_beat_tick_resol + + # velocity + note.velocity = self.regular_velocity_bins[ + np.argmin(abs(self.regular_velocity_bins-note.velocity))] + + # append + note_grid[quant_time].append(note) + + # set to track + instr_grid[key] = note_grid + + # --- pruning grouped notes --- 
# + self._pruning_grouped_notes_from_quantization(instr_grid) + + # --- process chords --- # + chord_grid = defaultdict(list) + for chord in chords: + # quantize + chord.time = chord.time - offset_by_resol + chord.time = 0 if chord.time < 0 else chord.time + quant_time = int(round(chord.time / in_beat_tick_resol)) * in_beat_tick_resol + chord_grid[quant_time].append(chord) + + # --- process tempos --- # + + first_notes_list = [] + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + if len(time_list) == 0: # 跳过空轨道 + continue + first_quant_time = time_list[0] + first_notes_list.append(first_quant_time) + + # 处理全空情况 + if not first_notes_list: + raise ValueError("[x] No valid notes found in any instrument track.") + quant_first_note_time = min(first_notes_list) + tempo_grid = defaultdict(list) + for tempo in tempos: + # quantize + tempo.time = tempo.time - offset_by_resol if tempo.time - offset_by_resol > 0 else 0 + quant_time = int(round(tempo.time / in_beat_tick_resol)) * in_beat_tick_resol + tempo.tempo = self.regular_tempo_bins[ + np.argmin(abs(self.regular_tempo_bins-tempo.tempo))] + if quant_time < quant_first_note_time: + tempo_grid[quant_first_note_time].append(tempo) + else: + tempo_grid[quant_time].append(tempo) + if len(tempo_grid[quant_first_note_time]) > 1: + tempo_grid[quant_first_note_time] = [tempo_grid[quant_first_note_time][-1]] + # --- process time signature --- # + quant_time_signature = deepcopy(midi_obj.time_signature_changes) + quant_time_signature.sort(key=lambda x: x.time) + for ts in quant_time_signature: + ts.time = ts.time - offset_by_resol if ts.time - offset_by_resol > 0 else 0 + ts.time = int(round(ts.time / in_beat_tick_resol)) * in_beat_tick_resol + + # --- make new midi object to check processed values --- # + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + for instr_idx in instr_grid.keys(): + new_instrument = Instrument(program=instr_idx) + new_instrument.notes = [y for x in instr_grid[instr_idx].values() for y in x] + new_midi_obj.instruments.append(new_instrument) + new_midi_obj.markers = [y for x in chord_grid.values() for y in x] + new_midi_obj.tempo_changes = [y for x in tempo_grid.values() for y in x] + new_midi_obj.time_signature_changes = midi_obj.time_signature_changes + + # make corpus + song_data = { + 'notes': instr_grid, + 'chords': chord_grid, + 'tempos': tempo_grid, + 'metadata': { + 'first_note': first_note_time, + 'last_note': last_note_time, + 'time_signature': quant_time_signature, + 'ticks_per_beat': midi_obj.ticks_per_beat, + } + } + return song_data, new_midi_obj + + def _make_instr_notes(self, midi_obj): + ''' + This part is important, we can use three different ways to merge instruments + 1st option: compare the number of notes and choose tracks with more notes + 2nd option: merge all instruments with the same tracks + 3rd option: leave all instruments as they are. 
differentiate tracks with different track number + + In this version we choose to use the 2nd option as it helps to reduce the number of tracks and sequence length + ''' + instr_notes = defaultdict(list) + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name is None: + continue + new_instr_idx = INSTRUMENT_PROGRAM_MAP[instr_name] + instr_notes[new_instr_idx].extend(instr.notes) + instr_notes[new_instr_idx].sort(key=lambda x: (x.start, -x.pitch)) + return instr_notes + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _merge_percussion(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge drum track to one track + ''' + drum_0_lst = [] + new_instruments = [] + for instrument in midi_obj.instruments: + if len(instrument.notes) == 0: + continue + if instrument.is_drum: + drum_0_lst.extend(instrument.notes) + else: + new_instruments.append(instrument) + if len(drum_0_lst) > 0: + drum_0_lst.sort(key=lambda x: x.start) + # remove duplicate + drum_0_lst = list(k for k, _ in itertools.groupby(drum_0_lst)) + drum_0_instrument = Instrument(program=114, is_drum=True, name="percussion") + drum_0_instrument.notes = drum_0_lst + new_instruments.append(drum_0_instrument) + midi_obj.instruments = new_instruments + + # referred to mmt "https://github.com/salu133445/mmt" + def _pruning_instrument(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge instrument number with similar intrument category + ex. 0: Acoustic Grand Piano, 1: Bright Acoustic Piano, 2: Electric Grand Piano into 0: Acoustic Grand Piano + ''' + new_instruments = [] + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name != None: + new_instruments.append(instr) + midi_obj.instruments = new_instruments + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _limit_max_track(self, midi_obj:miditoolkit.midi.parser.MidiFile, MAX_TRACK:int=16): + ''' + merge track with least notes to other track with same program + and limit the maximum amount of track to 16 + ''' + if len(midi_obj.instruments) == 1: + if midi_obj.instruments[0].is_drum: + midi_obj.instruments[0].program = 114 + midi_obj.instruments[0].is_drum = False + return midi_obj + good_instruments = midi_obj.instruments + good_instruments.sort( + key=lambda x: (not x.is_drum, -len(x.notes))) # place drum track or the most note track at first + assert good_instruments[0].is_drum == True or len(good_instruments[0].notes) >= len( + good_instruments[1].notes), tuple(len(x.notes) for x in good_instruments[:3]) + # assert good_instruments[0].is_drum == False, (, len(good_instruments[2])) + track_idx_lst = list(range(len(good_instruments))) + if len(good_instruments) > MAX_TRACK: + new_good_instruments = copy.deepcopy(good_instruments[:MAX_TRACK]) + # print(midi_file_path) + for id in track_idx_lst[MAX_TRACK:]: + cur_ins = good_instruments[id] + merged = False + new_good_instruments.sort(key=lambda x: len(x.notes)) + for nid, ins in enumerate(new_good_instruments): + if cur_ins.program == ins.program and cur_ins.is_drum == ins.is_drum: + new_good_instruments[nid].notes.extend(cur_ins.notes) + merged = True + break + if not merged: + pass + good_instruments = new_good_instruments + + assert len(good_instruments) <= MAX_TRACK, len(good_instruments) + for idx, good_instrument in enumerate(good_instruments): + if 
good_instrument.is_drum: + good_instruments[idx].program = 114 + good_instruments[idx].is_drum = False + midi_obj.instruments = good_instruments + + def _pruning_notes_for_chord_extraction(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + extract notes for chord extraction + ''' + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + new_instrument = Instrument(program=0, is_drum=False, name="for_chord") + new_instruments = [] + new_notes = [] + for instrument in midi_obj.instruments: + if instrument.program == 114 or instrument.is_drum: # pass drum track + continue + valid_notes = [note for note in instrument.notes if note.pitch >= 21 and note.pitch <= 108] + new_notes.extend(valid_notes) + new_notes.sort(key=lambda x: x.start) + new_instrument.notes = new_notes + new_instruments.append(new_instrument) + new_midi_obj.instruments = new_instruments + return new_midi_obj + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + corpus_maker = CorpusMaker(args.dataset, args.num_features, args.in_dir, args.out_dir, args.debug) + corpus_maker.make_corpus() + +if __name__ == "__main__": + main() +# python3 step1_midi2corpus.py --dataset SOD --num_features 5 +# python3 step2_corpus2event.py --dataset LakhClean --num_features 5 --encoding nb +# python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb +# python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb \ No newline at end of file diff --git a/data_representation/step1_midi2corpus_fined.py b/data_representation/step1_midi2corpus_fined.py new file mode 100644 index 0000000..e42cbfa --- /dev/null +++ b/data_representation/step1_midi2corpus_fined.py @@ -0,0 +1,654 @@ +import argparse +import time +import itertools +import copy +from copy import deepcopy +from pathlib import Path +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction +from typing import List +import os +from muspy import sort +import numpy as np +import pickle +from tqdm import tqdm + +import miditoolkit +from miditoolkit.midi.containers import Marker, Instrument +from chorder import Dechorder + +from constants import NUM2PITCH,FINED_PROGRAM_INSTRUMENT_MAP, INSTRUMENT_PROGRAM_MAP + +''' +This script is designed to preprocess MIDI files and convert them into a structured corpus suitable for symbolic music analysis or model training. +It handles various tasks, including setting beat resolution, calculating duration, velocity, and tempo bins, and processing MIDI data into quantized musical events. +We dont do instrument merging here. 
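+ Compared with step1_midi2corpus.py, _make_instr_notes keeps each note's original program number (FINED_PROGRAM_INSTRUMENT_MAP is only used to filter out unsupported programs), so related programs are not remapped onto a single coarse instrument.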
+''' + +def get_tempo_bin(max_tempo:int, ratio:float=1.1): + bpm = 30 + regular_tempo_bins = [bpm] + while bpm < max_tempo: + bpm *= ratio + bpm = round(bpm) + if bpm > max_tempo: + break + regular_tempo_bins.append(bpm) + return np.array(regular_tempo_bins) + +def split_markers(markers:List[miditoolkit.midi.containers.Marker]): + ''' + split markers into chord, tempo, label + ''' + chords = [] + for marker in markers: + splitted_text = marker.text.split('_') + if splitted_text[0] != 'global' and 'Boundary' not in splitted_text[0]: + chords.append(marker) + return chords + +class CorpusMaker(): + def __init__( + self, + dataset_name:str, + num_features:int, + in_dir:Path, + out_dir:Path, + debug:bool + ): + ''' + Initialize the CorpusMaker with dataset information and directory paths. + It sets up MIDI paths, output directories, and debug mode, then + retrieves the beat resolution, duration bins, velocity/tempo bins, and prepares the MIDI file list. + ''' + self.dataset_name = dataset_name + self.num_features = num_features + self.midi_path = in_dir / f"{dataset_name}" + self.out_dir = out_dir + self.debug = debug + self._get_in_beat_resolution() + self._get_duration_bins() + self._get_velocity_tempo_bins() + self._get_min_max_last_time() + self._prepare_midi_list() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def _get_duration_bins(self): + # Set up regular duration bins for quantizing note lengths, based on the beat resolution. + base_duration = {4:[1,2,3,4,5,6,8,10,12,16,20,24,28,32], + 8:[1,2,3,4,6,8,10,12,14,16,20,24,28,32,36,40,48,56,64], + 12:[1,2,3,4,6,9,12,15,18,24,30,36,42,48,54,60,72,84,96]} + base_duration_list = base_duration[self.in_beat_resolution] + self.regular_duration_bins = np.array(base_duration_list) + + def _get_velocity_tempo_bins(self): + # Define velocity and tempo bins based on whether the dataset is a performance or score type. + midi_type_dict = {'BachChorale': 'score', 'Pop1k7': 'perform', 'Pop909': 'score', 'SOD': 'score', 'LakhClean': 'score', 'Symphony': 'score'} + try: + midi_type = midi_type_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + midi_type = midi_type_dict['LakhClean'] + # For performance-type datasets, set finer granularity of velocity and tempo bins. + if midi_type == 'perform': + self.regular_velocity_bins = np.array(list(range(40, 128, 8)) + [127]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=240, ratio=1.04) + # For score-type datasets, use coarser velocity and tempo bins. + elif midi_type == 'score': + self.regular_velocity_bins = np.array([40, 60, 80, 100, 120]) + self.regular_tempo_bins = get_tempo_bin(max_tempo=390, ratio=1.04) + + def _get_min_max_last_time(self): + ''' + Set the minimum and maximum allowed length of a MIDI track, depending on the dataset. 
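+ Values are in seconds: _analyze derives the track length from get_tick_to_time_mapping(), and _check_length currently rejects only files shorter than min_last_time.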
+ 0 to 2000 means no limitation + ''' + # last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (60, 600), 'Symphony': (60, 1500)} + last_time_dict = {'BachChorale': (0, 2000), 'Pop1k7': (0, 2000), 'Pop909': (0, 2000), 'SOD': (60, 1000), 'LakhClean': (0, 2000), 'Symphony': (60, 1500)} + try: + self.min_last_time, self.max_last_time = last_time_dict[self.dataset_name] + except KeyError: + print(f"Dataset {self.dataset_name} is not supported. use the setting of LakhClean") + self.min_last_time, self.max_last_time = last_time_dict['LakhClean'] + + def _prepare_midi_list(self): + midi_path = Path(self.midi_path) + # detect subdirectories and get all midi files + if not midi_path.exists(): + raise ValueError(f"midi_path {midi_path} does not exist") + # go though all subdirectories and get all midi files + midi_files = [] + for root, _, files in os.walk(midi_path): + for file in files: + if file.endswith('.mid') or file.endswith('.midi') or file.endswith('.MID'): + # print(Path(root) / file) + midi_files.append(Path(root) / file) + self.midi_list = midi_files + print(f"Found {len(self.midi_list)} MIDI files in {midi_path}") + + def make_corpus(self) -> None: + ''' + Main method to process the MIDI files and create the corpus data. + It supports both single-processing (debug mode) and multi-processing for large datasets. + ''' + print("preprocessing midi data to corpus data") + # check the corpus folder is already exist and make it if not + Path(self.out_dir).mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"corpus_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + Path(self.out_dir / f"midi_{self.dataset_name}").mkdir(parents=True, exist_ok=True) + start_time = time.time() + if self.debug: + # single processing for debugging + broken_counter = 0 + success_counter = 0 + for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + message = self._mp_midi2corpus(file_path) + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + else: + # Multi-threaded processing for faster corpus generation. 
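+ # Each worker process runs self._mp_midi2corpus on one MIDI path and returns
+ # "success" or "error"; Pool.imap streams results back in input order, so the
+ # tqdm bar can advance while later files are still being converted. The third
+ # argument to imap (500 below) is the chunksize: paths are handed to workers in
+ # batches of 500 to reduce inter-process communication overhead.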
+ broken_counter = 0 + success_counter = 0 + # filter out processed files + print(self.out_dir) + processed_files = list(Path(self.out_dir).glob(f"midi_{self.dataset_name}/*.mid")) + processed_files = [x.name for x in processed_files] + print(f"processed files: {len(processed_files)}") + print("length of midi list: ", len(self.midi_list)) + # Use set for faster lookup (O(1) per check) + processed_files_set = set(processed_files) + # self.midi_list = [x for x in self.midi_list if x.name not in processed_files_set] + # reverse the list to process the latest files first + self.midi_list.reverse() + print(f"length of midi list after filtering: ", len(self.midi_list)) + with Pool(16) as p: + for message in tqdm(p.imap(self._mp_midi2corpus, self.midi_list, 500), total=len(self.midi_list)): + if message == "error": + broken_counter += 1 + elif message == "success": + success_counter += 1 + # for file_path in tqdm(self.midi_list, total=len(self.midi_list)): + # message = self._mp_midi2corpus(file_path) + # if message == "error": + # broken_counter += 1 + # elif message == "success": + # success_counter += 1 + print(f"Making corpus takes: {time.time() - start_time}s, success: {success_counter}, broken: {broken_counter}") + + def _mp_midi2corpus(self, file_path: Path): + """Convert MIDI to corpus format and save both corpus (.pkl) and MIDI (.mid).""" + try: + midi_obj = self._analyze(file_path) + corpus, midi_obj = self._midi2corpus(midi_obj) + # --- 1. Save corpus (.pkl) --- + relative_path = file_path.relative_to(self.midi_path) # Get relative path from input dir + safe_name = str(relative_path).replace("/", "_").replace("\\", "_").replace(".mid", ".pkl") + save_path = Path(self.out_dir) / f"corpus_{self.dataset_name}" / safe_name + save_path.parent.mkdir(parents=True, exist_ok=True) # Ensure dir exists + with save_path.open("wb") as f: + pickle.dump(corpus, f) + + # --- 2. Save MIDI (.mid) --- + midi_save_dir = Path("../dataset/represented_data/corpus") / f"midi_{self.dataset_name}" + midi_save_dir.mkdir(parents=True, exist_ok=True) + midi_save_path = midi_save_dir / file_path.name # Keep original MIDI filename + midi_obj.dump(midi_save_path) + + del midi_obj, corpus + return "success" + + except (OSError, EOFError, ValueError, KeyError, AssertionError) as e: + print(f"Error processing {file_path.name}: {e}") + return "error" + except Exception as e: + print(f"Unexpected error in {file_path.name}: {e}") + return "error" + def _check_length(self, last_time:float): + if last_time < self.min_last_time: + raise ValueError(f"last time {last_time} is out of range") + + def _analyze(self, midi_path:Path): + # Loads and analyzes a MIDI file, performing various checks and extracting chords. 
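+ # Pipeline: (1) reject files shorter than min_last_time, (2) drop instruments with
+ # no notes, (3) merge all drum tracks into a single program-114 "percussion" track,
+ # (4) cap the number of tracks at 16, and, when 7 or 8 features are requested,
+ # (5) run chorder's Dechorder over a pruned copy of the notes and store the
+ # de-duplicated chord sequence as markers of the form '<root>_<quality>_<bass>'
+ # (e.g. 'C_M_C', assuming NUM2PITCH maps pitch classes to note names), with
+ # 'N_N_N' marking incomplete chords.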
+ midi_obj = miditoolkit.midi.parser.MidiFile(midi_path) + + # check length + mapping = midi_obj.get_tick_to_time_mapping() + last_time = mapping[midi_obj.max_tick] + self._check_length(last_time) + + for ins in midi_obj.instruments: + # delete instrument with no notes + if len(ins.notes) == 0: + midi_obj.instruments.remove(ins) + continue + notes = ins.notes + notes = sorted(notes, key=lambda x: (x.start, x.pitch)) + + # three steps to merge instruments + self._merge_percussion(midi_obj) + # self._pruning_instrument(midi_obj) + self._limit_max_track(midi_obj) + + if self.num_features == 7 or self.num_features == 8: + # in case of 7 or 8 features, we need to extract chords + new_midi_obj = self._pruning_notes_for_chord_extraction(midi_obj) + chords = Dechorder.dechord(new_midi_obj) + markers = [] + for cidx, chord in enumerate(chords): + if chord.is_complete(): + chord_text = NUM2PITCH[chord.root_pc] + '_' + chord.quality + '_' + NUM2PITCH[chord.bass_pc] + else: + chord_text = 'N_N_N' + markers.append(Marker(time=int(cidx*new_midi_obj.ticks_per_beat), text=chord_text)) + + # de-duplication + prev_chord = None + dedup_chords = [] + for m in markers: + if m.text != prev_chord: + prev_chord = m.text + dedup_chords.append(m) + + # return midi + midi_obj.markers = dedup_chords + return midi_obj + + def _pruning_grouped_notes_from_quantization(self, instr_grid:dict): + ''' + In case where notes are grouped in the same quant_time but with different start time, unintentional chords are created + rule1: if notes have half step interval, delete the shorter one + rule2: if notes do not share 50% of duration of the shorter note, delete the shorter one + ''' + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + for time in time_list: + notes = instr_grid[instr][time] + if len(notes) == 1: + continue + else: + new_notes = [] + # sort in pitch with ascending order + notes.sort(key=lambda x: x.pitch) + for i in range(len(notes)-1): + # if start time is same add to new_notes + if notes[i].start == notes[i+1].start: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + continue + if notes[i].pitch == notes[i+1].pitch or notes[i].pitch + 1 == notes[i+1].pitch: + # select longer note + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + # check how much duration they share + shared_duration = min(notes[i].end, notes[i+1].end) - max(notes[i].start, notes[i+1].start) + shorter_duration = min(notes[i].end - notes[i].start, notes[i+1].end - notes[i+1].start) + # unless they share more than 80% of duration, select longer note (pruning shorter note) + if shared_duration / shorter_duration < 0.8: + if notes[i].end - notes[i].start > notes[i+1].end - notes[i+1].start: + new_notes.append(notes[i]) + else: + new_notes.append(notes[i+1]) + else: + if len(new_notes) == 0: + new_notes.append(notes[i]) + new_notes.append(notes[i+1]) + else: + new_notes.append(notes[i+1]) + instr_grid[instr][time] = new_notes + + def _midi2corpus(self, midi_obj:miditoolkit.midi.parser.MidiFile): + # Checks if the ticks per beat in the MIDI file is lower than the expected resolution. + # If it is, raise an error. + if midi_obj.ticks_per_beat < self.in_beat_resolution: + raise ValueError(f'[x] Irregular ticks_per_beat. {midi_obj.ticks_per_beat}') + + # Ensure there is at least one time signature change in the MIDI file. 
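+ # The validation checks below are commented out in this variant; when a file has no
+ # time-signature or tempo events, 4/4 and 120 BPM defaults are substituted further
+ # down instead of rejecting the file.
+ # Worked example of the quantization grid below, assuming ticks_per_beat = 480 and
+ # in_beat_resolution = 4: in_beat_tick_resol = Fraction(480, 4) = 120 ticks per grid
+ # step, first_bar_resol = 480 * 4 * (4/4) = 1920 ticks for a 4/4 bar, and a note
+ # starting at tick 250 is snapped to round(250 / 120) * 120 = 240.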
+ # if len(midi_obj.time_signature_changes) == 0: + # raise ValueError('[x] No time_signature_changes') + + # Ensure there are no duplicated time signature changes. + # time_list = [ts.time for ts in midi_obj.time_signature_changes] + # if len(time_list) != len(set(time_list)): + # raise ValueError('[x] Duplicated time_signature_changes') + + # If the dataset is 'LakhClean' or 'SymphonyMIDI', verify there are at least 4 tracks. + # if self.dataset_name == 'LakhClean' or self.dataset_name == 'SymphonyMIDI': + # if len(midi_obj.instruments) < 4: + # raise ValueError('[x] We will use more than 4 tracks in Lakh Clean dataset.') + + # Calculate the resolution of ticks per beat as a fraction. + in_beat_tick_resol = Fraction(midi_obj.ticks_per_beat, self.in_beat_resolution) + + # Extract the initial time signature (numerator and denominator) and calculate the number of ticks for the first bar. + if len(midi_obj.time_signature_changes) != 0: + initial_numerator = midi_obj.time_signature_changes[0].numerator + initial_denominator = midi_obj.time_signature_changes[0].denominator + else: + # If no time signature changes, set default values + initial_numerator = 4 + initial_denominator = 4 + first_bar_resol = int(midi_obj.ticks_per_beat * initial_numerator * (4 / initial_denominator)) + + # --- load notes --- # + instr_notes = self._make_instr_notes(midi_obj) + # --- load information --- # + # load chords, labels + chords = split_markers(midi_obj.markers) + chords.sort(key=lambda x: x.time) + + + # load tempos + tempos = midi_obj.tempo_changes if len(midi_obj.tempo_changes) > 0 else [] + if len(tempos) == 0: + # if no tempo changes, set the default tempo to 120 BPM + tempos = [miditoolkit.midi.containers.TempoChange(time=0, tempo=120)] + tempos.sort(key=lambda x: x.time) + + # --- process items to grid --- # + # compute empty bar offset at head + first_note_time = min([instr_notes[k][0].start for k in instr_notes.keys()]) + last_note_time = max([instr_notes[k][-1].start for k in instr_notes.keys()]) + + quant_time_first = int(round(first_note_time / in_beat_tick_resol)) * in_beat_tick_resol + offset = quant_time_first // first_bar_resol # empty bar + offset_by_resol = offset * first_bar_resol + # --- process notes --- # + instr_grid = dict() + for key in instr_notes.keys(): + notes = instr_notes[key] + note_grid = defaultdict(list) + for note in notes: + # skip notes out of range, below C-1 and above C8 + if note.pitch < 12 or note.pitch >= 120: + continue + + # in case when the first note starts at slightly before the first bar + note.start = note.start - offset_by_resol if note.start - offset_by_resol > 0 else 0 + note.end = note.end - offset_by_resol if note.end - offset_by_resol > 0 else 0 + + # relative duration + # skip note with 0 duration + note_duration = note.end - note.start + relative_duration = round(note_duration / in_beat_tick_resol) + if relative_duration == 0: + continue + if relative_duration > self.in_beat_resolution * 8: # 8 beats + relative_duration = self.in_beat_resolution * 8 + + # use regular duration bins + note.quantized_duration = self.regular_duration_bins[np.argmin(abs(self.regular_duration_bins-relative_duration))] + + # quantize start time + quant_time = int(round(note.start / in_beat_tick_resol)) * in_beat_tick_resol + + # velocity + note.velocity = self.regular_velocity_bins[ + np.argmin(abs(self.regular_velocity_bins-note.velocity))] + + # append + note_grid[quant_time].append(note) + + # set to track + instr_grid[key] = note_grid + + # --- pruning grouped notes --- 
# + self._pruning_grouped_notes_from_quantization(instr_grid) + + # --- process chords --- # + chord_grid = defaultdict(list) + for chord in chords: + # quantize + chord.time = chord.time - offset_by_resol + chord.time = 0 if chord.time < 0 else chord.time + quant_time = int(round(chord.time / in_beat_tick_resol)) * in_beat_tick_resol + chord_grid[quant_time].append(chord) + + # --- process tempos --- # + + first_notes_list = [] + for instr in instr_grid.keys(): + time_list = sorted(list(instr_grid[instr].keys())) + if len(time_list) == 0: # 跳过空轨道 + continue + first_quant_time = time_list[0] + first_notes_list.append(first_quant_time) + + # 处理全空情况 + if not first_notes_list: + raise ValueError("[x] No valid notes found in any instrument track.") + quant_first_note_time = min(first_notes_list) + tempo_grid = defaultdict(list) + for tempo in tempos: + # quantize + tempo.time = tempo.time - offset_by_resol if tempo.time - offset_by_resol > 0 else 0 + quant_time = int(round(tempo.time / in_beat_tick_resol)) * in_beat_tick_resol + tempo.tempo = self.regular_tempo_bins[ + np.argmin(abs(self.regular_tempo_bins-tempo.tempo))] + if quant_time < quant_first_note_time: + tempo_grid[quant_first_note_time].append(tempo) + else: + tempo_grid[quant_time].append(tempo) + if len(tempo_grid[quant_first_note_time]) > 1: + tempo_grid[quant_first_note_time] = [tempo_grid[quant_first_note_time][-1]] + # --- process time signature --- # + quant_time_signature = deepcopy(midi_obj.time_signature_changes) + quant_time_signature.sort(key=lambda x: x.time) + for ts in quant_time_signature: + ts.time = ts.time - offset_by_resol if ts.time - offset_by_resol > 0 else 0 + ts.time = int(round(ts.time / in_beat_tick_resol)) * in_beat_tick_resol + + # --- make new midi object to check processed values --- # + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + for instr_idx in instr_grid.keys(): + new_instrument = Instrument(program=instr_idx) + new_instrument.notes = [y for x in instr_grid[instr_idx].values() for y in x] + new_midi_obj.instruments.append(new_instrument) + new_midi_obj.markers = [y for x in chord_grid.values() for y in x] + new_midi_obj.tempo_changes = [y for x in tempo_grid.values() for y in x] + new_midi_obj.time_signature_changes = midi_obj.time_signature_changes + + # make corpus + song_data = { + 'notes': instr_grid, + 'chords': chord_grid, + 'tempos': tempo_grid, + 'metadata': { + 'first_note': first_note_time, + 'last_note': last_note_time, + 'time_signature': quant_time_signature, + 'ticks_per_beat': midi_obj.ticks_per_beat, + } + } + return song_data, new_midi_obj + + def _make_instr_notes(self, midi_obj): + ''' + This part is important, we can use three different ways to merge instruments + 1st option: compare the number of notes and choose tracks with more notes + 2nd option: merge all instruments with the same tracks + 3rd option: leave all instruments as they are. 
differentiate tracks with different track number + + In this version we choose to use the 2nd option as it helps to reduce the number of tracks and sequence length + ''' + instr_notes = defaultdict(list) + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = FINED_PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name is None: + continue + # new_instr_idx = INSTRUMENT_PROGRAM_MAP[instr_name] + new_instr_idx = instr_idx + if new_instr_idx not in instr_notes: + instr_notes[new_instr_idx] = [] + instr_notes[new_instr_idx].extend(instr.notes) + instr_notes[new_instr_idx].sort(key=lambda x: (x.start, -x.pitch)) + return instr_notes + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _merge_percussion(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge drum track to one track + ''' + drum_0_lst = [] + new_instruments = [] + for instrument in midi_obj.instruments: + if len(instrument.notes) == 0: + continue + if instrument.is_drum: + drum_0_lst.extend(instrument.notes) + else: + new_instruments.append(instrument) + if len(drum_0_lst) > 0: + drum_0_lst.sort(key=lambda x: x.start) + # remove duplicate + drum_0_lst = list(k for k, _ in itertools.groupby(drum_0_lst)) + drum_0_instrument = Instrument(program=114, is_drum=True, name="percussion") + drum_0_instrument.notes = drum_0_lst + new_instruments.append(drum_0_instrument) + midi_obj.instruments = new_instruments + + # referred to mmt "https://github.com/salu133445/mmt" + def _pruning_instrument(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + merge instrument number with similar intrument category + ex. 0: Acoustic Grand Piano, 1: Bright Acoustic Piano, 2: Electric Grand Piano into 0: Acoustic Grand Piano + ''' + new_instruments = [] + for instr in midi_obj.instruments: + instr_idx = instr.program + # change instrument idx + instr_name = PROGRAM_INSTRUMENT_MAP.get(instr_idx) + if instr_name != None: + new_instruments.append(instr) + midi_obj.instruments = new_instruments + + # refered to SymphonyNet "https://github.com/symphonynet/SymphonyNet" + def _limit_max_track(self, midi_obj:miditoolkit.midi.parser.MidiFile, MAX_TRACK:int=16): + ''' + merge track with least notes to other track with same program + and limit the maximum amount of track to 16 + ''' + if len(midi_obj.instruments) == 1: + if midi_obj.instruments[0].is_drum: + midi_obj.instruments[0].program = 114 + midi_obj.instruments[0].is_drum = False + return midi_obj + good_instruments = midi_obj.instruments + good_instruments.sort( + key=lambda x: (not x.is_drum, -len(x.notes))) # place drum track or the most note track at first + assert good_instruments[0].is_drum == True or len(good_instruments[0].notes) >= len( + good_instruments[1].notes), tuple(len(x.notes) for x in good_instruments[:3]) + # assert good_instruments[0].is_drum == False, (, len(good_instruments[2])) + track_idx_lst = list(range(len(good_instruments))) + if len(good_instruments) > MAX_TRACK: + new_good_instruments = copy.deepcopy(good_instruments[:MAX_TRACK]) + # print(midi_file_path) + for id in track_idx_lst[MAX_TRACK:]: + cur_ins = good_instruments[id] + merged = False + new_good_instruments.sort(key=lambda x: len(x.notes)) + for nid, ins in enumerate(new_good_instruments): + if cur_ins.program == ins.program and cur_ins.is_drum == ins.is_drum: + new_good_instruments[nid].notes.extend(cur_ins.notes) + merged = True + break + if not merged: + pass + good_instruments = new_good_instruments + + assert len(good_instruments) 
<= MAX_TRACK, len(good_instruments) + for idx, good_instrument in enumerate(good_instruments): + if good_instrument.is_drum: + good_instruments[idx].program = 114 + good_instruments[idx].is_drum = False + midi_obj.instruments = good_instruments + + def _pruning_notes_for_chord_extraction(self, midi_obj:miditoolkit.midi.parser.MidiFile): + ''' + extract notes for chord extraction + ''' + new_midi_obj = miditoolkit.midi.parser.MidiFile() + new_midi_obj.ticks_per_beat = midi_obj.ticks_per_beat + new_midi_obj.max_tick = midi_obj.max_tick + new_instrument = Instrument(program=0, is_drum=False, name="for_chord") + new_instruments = [] + new_notes = [] + for instrument in midi_obj.instruments: + if instrument.program == 114 or instrument.is_drum: # pass drum track + continue + valid_notes = [note for note in instrument.notes if note.pitch >= 21 and note.pitch <= 108] + new_notes.extend(valid_notes) + new_notes.sort(key=lambda x: x.start) + new_instrument.notes = new_notes + new_instruments.append(new_instrument) + new_midi_obj.instruments = new_instruments + return new_midi_obj + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + corpus_maker = CorpusMaker(args.dataset, args.num_features, args.in_dir, args.out_dir, args.debug) + corpus_maker.make_corpus() + +if __name__ == "__main__": + main() +# python3 step1_midi2corpus.py --dataset SOD --num_features 5 +# python3 step2_corpus2event.py --dataset LakhClean --num_features 5 --encoding nb +# python3 step3_creating_vocab.py --dataset SOD --num_features 5 --encoding nb +# python3 step4_event2tuneidx.py --dataset SOD --num_features 5 --encoding nb \ No newline at end of file diff --git a/data_representation/step2_corpus2event.py b/data_representation/step2_corpus2event.py new file mode 100644 index 0000000..74b2b55 --- /dev/null +++ b/data_representation/step2_corpus2event.py @@ -0,0 +1,147 @@ +import argparse +import time +from pathlib import Path + +import pickle +from tqdm import tqdm +from multiprocessing import Pool + +import encoding_utils + +''' +This script is for converting corpus data to event data. 
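+ For each corpus .pkl it applies Corpus2event_<scheme> from encoding_utils (scheme is one of remi / cp / nb / remi_pos) and writes the resulting event list, under the same file name, to events_<dataset>/<scheme><num_features>/.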
+''' + +class Corpus2Event(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool, + cache: bool, + ): + self.dataset = dataset + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"corpus_{self.dataset}" + self.out_dir = out_dir / f"events_{self.dataset}" / self.encoding_name + self.debug = debug + self.cache = cache + self.encoding_function = getattr(encoding_utils, f'Corpus2event_{encoding_scheme}')(num_features) + self._get_in_beat_resolution() + + def _get_in_beat_resolution(self): + # Retrieve the resolution of quarter note based on the dataset name (e.g., 4 means the minimum resolution sets to 16th note) + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + self.in_beat_resolution = in_beat_resolution_dict[self.dataset] + except KeyError: + print(f"Dataset {self.dataset} is not supported. use the setting of LakhClean") + self.in_beat_resolution = in_beat_resolution_dict['LakhClean'] + + def make_events(self): + ''' + Preprocess corpus data to events data. + The process in each encoding scheme is different. + Please refer to encoding_utils.py for more details. + ''' + print("preprocessing corpus data to events data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + # single-processing + broken_count = 0 + success_count = 0 + corpus_list = sorted(list(self.in_dir.rglob("*.pkl"))) + if corpus_list == []: + print(f"No corpus files found in {self.in_dir}. Please check the directory.") + corpus_list = sorted(list(self.in_dir.glob("*.pkli"))) + # remove the corpus files that are already in the out_dir + # Use set for faster existence checks + existing_files = set(f.name for f in self.out_dir.glob("*.pkl")) + # corpus_list = [corpus for corpus in corpus_list if corpus.name not in existing_files] + for filepath_name, event in tqdm(map(self._load_single_corpus_and_make_event, corpus_list), total=len(corpus_list)): + if event is None: + broken_count += 1 + continue + # if using cache, check if the event file already exists + if self.cache and (self.out_dir / filepath_name).exists(): + # print(f"event file {filepath_name} already exists, skipping") + continue + with open(self.out_dir / filepath_name, 'wb') as f: + pickle.dump(event, f) + success_count += 1 + del event + print(f"taken time for making events is {time.time()-start_time}s, success: {success_count}, broken: {broken_count}") + + def _load_single_corpus_and_make_event(self, file_path): + try: + with open(file_path, 'rb') as f: + corpus = pickle.load(f) + event = self.encoding_function(corpus, self.in_beat_resolution) + except Exception as e: + print(f"error in encoding {file_path}: {e}") + event = None + return file_path.name, event + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb", "remi_pos"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/corpus/", + type=Path, + help="input data 
directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/events/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + parser.add_argument( + "--cache", + action="store_true", + help="enable cache mode", + ) + return parser + +def main(): + args = get_argument_parser().parse_args() + corpus2event = Corpus2Event(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug, args.cache) + corpus2event.make_events() + +if __name__ == "__main__": + main() diff --git a/data_representation/step3_creating_vocab.py b/data_representation/step3_creating_vocab.py new file mode 100644 index 0000000..fa8cc33 --- /dev/null +++ b/data_representation/step3_creating_vocab.py @@ -0,0 +1,84 @@ +import argparse +from pathlib import Path + +import vocab_utils + +''' +This script is for creating vocab file. +''' + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../vocab/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + args = get_argument_parser().parse_args() + encoding_scheme = args.encoding + num_features = args.num_features + dataset = args.dataset + + out_vocab_path = args.out_dir / f"vocab_{dataset}" + out_vocab_path.mkdir(parents=True, exist_ok=True) + out_vocab_file_path = out_vocab_path / f"vocab_{dataset}_{encoding_scheme}{num_features}.json" + + events_path = Path(args.in_dir / f"events_{dataset}" / f"{encoding_scheme}{num_features}") + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + event_data = sorted(list(events_path.rglob("*.pkl"))) + if event_data == []: + print(f"No event files found in {events_path}. 
Please check the directory.") + event_data = sorted(list(events_path.glob("*.pkli"))) + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=None, + event_data=event_data, + encoding_scheme=encoding_scheme, + num_features=num_features + ) + vocab.save_vocab(out_vocab_file_path) + print(f"Vocab file saved at {out_vocab_file_path}") + +if __name__ == "__main__": + main() diff --git a/data_representation/step4_event2tuneidx.py b/data_representation/step4_event2tuneidx.py new file mode 100644 index 0000000..e429265 --- /dev/null +++ b/data_representation/step4_event2tuneidx.py @@ -0,0 +1,127 @@ +import argparse +import time +from pathlib import Path + +import numpy as np +import pickle +from tqdm import tqdm + +import vocab_utils + +class Event2tuneidx(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool + ): + self.dataset = dataset + self.encoding_scheme = encoding_scheme + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"events_{self.dataset}" / self.encoding_name + self.out_dir = out_dir / f"tuneidx_{self.dataset}" / self.encoding_name + self.debug = debug + + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + in_vocab_file_path = Path(f"../vocab/vocab_{dataset}/vocab_{dataset}_{encoding_scheme}{num_features}.json") + self.vocab = getattr(vocab_utils, selected_vocab_name)(in_vocab_file_path=in_vocab_file_path, event_data=None, + encoding_scheme=encoding_scheme, num_features=num_features) + + def _convert_event_to_tune_in_idx(self, tune_in_event): + tune_in_idx = [] + for event in tune_in_event: + event_in_idx = self.vocab(event) + if event_in_idx != None: + tune_in_idx.append(event_in_idx) + return tune_in_idx + + def _load_single_event_and_make_tune_in_idx(self, file_path): + with open(file_path, 'rb') as f: + tune_in_event = pickle.load(f) + tune_in_idx = self._convert_event_to_tune_in_idx(tune_in_event) + return file_path.name, tune_in_idx + + def make_tune_in_idx(self): + print("preprocessing events data to tune_in_idx data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + event_list = sorted(list(self.in_dir.rglob("*.pkl"))) + if event_list == []: + event_list = sorted(list(self.in_dir.glob("*.pkli"))) + for filepath_name, tune_in_idx in tqdm(map(self._load_single_event_and_make_tune_in_idx, event_list), total=len(event_list)): + # save tune_in_idx as npz file with uint16 dtype for remi because it has more than 256 tokens + if self.encoding_scheme == 'remi': + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + else: + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + if np.max(tune_in_idx) < 256: + tune_in_idx = np.array(tune_in_idx, dtype=np.uint8) + if filepath_name.endswith('.pkli'): + file_name = filepath_name.replace('.pkli', '.npz') + else: + file_name = filepath_name.replace('.pkl', '.npz') + np.savez_compressed(self.out_dir / file_name, tune_in_idx) + del tune_in_idx + print(f"taken time for making tune_in_idx is {time.time()-start_time}") + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", 
"nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/tuneidx/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + + event2tuneidx = Event2tuneidx(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug) + event2tuneidx.make_tune_in_idx() + +if __name__ == "__main__": + main() diff --git a/data_representation/step4_event2tuneidx_addprompt.py b/data_representation/step4_event2tuneidx_addprompt.py new file mode 100644 index 0000000..4418f0f --- /dev/null +++ b/data_representation/step4_event2tuneidx_addprompt.py @@ -0,0 +1,122 @@ +import argparse +import time +from pathlib import Path + +import numpy as np +import pickle +from tqdm import tqdm + +import vocab_utils + +class Event2tuneidx(): + def __init__( + self, + dataset: str, + encoding_scheme: str, + num_features: int, + in_dir: Path, + out_dir: Path, + debug: bool + ): + self.dataset = dataset + self.encoding_scheme = encoding_scheme + self.encoding_name = encoding_scheme + str(num_features) + self.in_dir = in_dir / f"events_{self.dataset}" / self.encoding_name + self.out_dir = out_dir / f"tuneidx_{self.dataset}" / self.encoding_name + self.debug = debug + + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + in_vocab_file_path = Path(f"../vocab/vocab_{dataset}/vocab_{dataset}_{encoding_scheme}{num_features}.json") + self.vocab = getattr(vocab_utils, selected_vocab_name)(in_vocab_file_path=in_vocab_file_path, event_data=None, + encoding_scheme=encoding_scheme, num_features=num_features) + + def _convert_event_to_tune_in_idx(self, tune_in_event): + tune_in_idx = [] + for event in tune_in_event: + event_in_idx = self.vocab(event) + if event_in_idx != None: + tune_in_idx.append(event_in_idx) + return tune_in_idx + + def _load_single_event_and_make_tune_in_idx(self, file_path): + with open(file_path, 'rb') as f: + tune_in_event = pickle.load(f) + tune_in_idx = self._convert_event_to_tune_in_idx(tune_in_event) + return file_path.name, tune_in_idx + + def make_tune_in_idx(self): + print("preprocessing events data to tune_in_idx data") + # check output directory exists + self.out_dir.mkdir(parents=True, exist_ok=True) + start_time = time.time() + event_list = sorted(list(self.in_dir.rglob("*.pkl"))) + for filepath_name, tune_in_idx in tqdm(map(self._load_single_event_and_make_tune_in_idx, event_list), total=len(event_list)): + # save tune_in_idx as npz file with uint16 dtype for remi because it has more than 256 tokens + if self.encoding_scheme == 'remi': + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + else: + tune_in_idx = np.array(tune_in_idx, dtype=np.int16) + if np.max(tune_in_idx) < 256: + tune_in_idx = np.array(tune_in_idx, dtype=np.uint8) + file_name = filepath_name.replace('.pkl', '.npz') + np.savez_compressed(self.out_dir / file_name, tune_in_idx) + del tune_in_idx + print(f"taken time for making tune_in_idx is {time.time()-start_time}") + +def get_argument_parser(): + 
parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--dataset", + required=True, + # choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"), + type=str, + help="dataset names", + ) + parser.add_argument( + "-e", + "--encoding", + required=True, + choices=("remi", "cp", "nb"), + type=str, + help="encoding scheme", + ) + parser.add_argument( + "-f", + "--num_features", + required=True, + choices=(4, 5, 7, 8), + type=int, + help="number of features", + ) + parser.add_argument( + "-i", + "--in_dir", + default="../dataset/represented_data/events/", + type=Path, + help="input data directory", + ) + parser.add_argument( + "-o", + "--out_dir", + default="../dataset/represented_data/tuneidx_withcaption/", + type=Path, + help="output data directory", + ) + parser.add_argument( + "--debug", + action="store_true", + help="enable debug mode", + ) + return parser + +def main(): + parser = get_argument_parser() + args = parser.parse_args() + + event2tuneidx = Event2tuneidx(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug) + event2tuneidx.make_tune_in_idx() + +if __name__ == "__main__": + main() diff --git a/data_representation/vocab_utils.py b/data_representation/vocab_utils.py new file mode 100644 index 0000000..999385b --- /dev/null +++ b/data_representation/vocab_utils.py @@ -0,0 +1,395 @@ +import pickle +from pathlib import Path +from typing import Union +from multiprocessing import Pool, cpu_count +from collections import defaultdict +from fractions import Fraction + +import torch + +import json +from tqdm import tqdm + +def sort_key(s): + fraction_part = s.split('_')[-1] + numerator, denominator = map(int, fraction_part.split('/')) + # Return a tuple with denominator first, then numerator, both in negative for descending order + return (-denominator, -numerator) + +class LangTokenVocab: + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + ''' + Initializes the LangTokenVocab class. + + Args: + in_vocab_file_path (Union[Path, None]): Path to the pre-made vocabulary file (optional). + event_data (list): List of event data to create a vocabulary if no pre-made vocab is provided. + encoding_scheme (str): Encoding scheme to be used (e.g., 'remi', 'cp', 'nb'). + num_features (int): Number of features to be used (e.g., 4, 5, 7, 8). + + Summary: + This class is responsible for handling vocabularies used in language models, especially for REMI encoding. + It supports multiple encoding schemes, creates vocabularies based on event data, handles special tokens (e.g., + start/end of sequence), and manages feature-specific masks. It provides methods for saving, loading, and decoding + vocabularies. It also supports vocabulary augmentation for pitch, instrument, beat, and chord features, ensuring + that these are arranged and ordered appropriately. + + For all encoding schemes, the metric or special tokens are named as 'type', + so that we can easily handle and compare among different encoding schemes. 
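+ For example, with the 'remi' scheme every token is a single string such as 'Bar_None', 'Beat_4', or 'Note_Pitch_60' mapped to one integer id, whereas 'cp' and 'nb' keep a separate idx-to-token dictionary per feature (type, beat, pitch, duration, ...).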
+ ''' + + self.encoding_scheme = encoding_scheme + self.num_features = num_features + self._prepare_in_vocab(in_vocab_file_path, event_data) # Prepares initial vocab based on the input file or event data + self._get_features() # Extracts relevant features based on the num_features + self.idx2event, self.event2idx = self._get_vocab(event_data, unique_vocabs=self.idx2event) # Creates vocab or loads premade vocab + if self.encoding_scheme == 'remi': + self._make_mask() # Generates masks for 'remi' encoding scheme + self._get_sos_eos_token() # Retrieves special tokens (Start of Sequence, End of Sequence) + + # Prepares vocabulary if a pre-made vocab file exists or handles cases with no input file. + def _prepare_in_vocab(self, in_vocab_file_path, event_data): + if in_vocab_file_path is not None: + with open(in_vocab_file_path, 'r') as f: + idx2event_temp = json.load(f) + if self.encoding_scheme == 'cp' or self.encoding_scheme == 'nb': + for key in idx2event_temp.keys(): + idx2event_temp[key] = {int(idx):tok for idx, tok in idx2event_temp[key].items()} + elif self.encoding_scheme == 'remi': + idx2event_temp = {int(idx):tok for idx, tok in idx2event_temp.items()} + self.idx2event = idx2event_temp + elif in_vocab_file_path is None and event_data is None: + raise NotImplementedError('either premade vocab or event_data should be given') + else: + self.idx2event = None + + # Extracts features depending on the number of features chosen (4, 5, 7, 8). + def _get_features(self): + feature_args = { + 4: ["type", "beat", "pitch", "duration"], + 5: ["type", "beat", "instrument", "pitch", "duration"], + 7: ["type", "beat", "chord", "tempo", "pitch", "duration", "velocity"], + 8: ["type", "beat", "chord", "tempo", "instrument", "pitch", "duration", "velocity"]} + self.feature_list = feature_args[self.num_features] + + # Saves the current vocabulary to a specified JSON path. + def save_vocab(self, json_path): + with open(json_path, 'w') as f: + json.dump(self.idx2event, f, indent=2, ensure_ascii=False) + + # Returns the size of the current vocabulary. + def get_vocab_size(self): + return len(self.idx2event) + + # Handles Start of Sequence (SOS) and End of Sequence (EOS) tokens based on the encoding scheme. + def _get_sos_eos_token(self): + if self.encoding_scheme == 'remi': + self.sos_token = [self.event2idx['SOS_None']] + self.eos_token = [[self.event2idx['EOS_None']]] + else: + self.sos_token = [[self.event2idx['type']['SOS']] + [0] * (self.num_features - 1)] + self.eos_token = [[self.event2idx['type']['EOS']] + [0] * (self.num_features - 1)] + + # Generates vocabularies by either loading from a file or creating them based on the event data. 
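+ # When building from event_data, every event is rendered as '<name>_<value>', the
+ # special tokens are pinned to the first slots, and the remaining vocabulary is
+ # re-ordered (pitch range widened by a few semitones, instruments/chords/beats
+ # sorted). An illustrative result, assuming a small REMI vocabulary:
+ #   idx2event = {0: 'SOS_None', 1: 'EOS_None', 2: 'Bar_None', 3: 'Beat_0', ...}
+ #   event2idx = {'SOS_None': 0, 'EOS_None': 1, 'Bar_None': 2, 'Beat_0': 3, ...}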
+ def _get_vocab(self, event_data, unique_vocabs=None): + # make new vocab from given event_data + if event_data is not None: + unique_char_list = list(set([f'{event["name"]}_{event["value"]}' for tune_path in event_data for event in pickle.load(open(tune_path, 'rb'))])) + unique_vocabs = sorted(unique_char_list) + unique_vocabs.remove('SOS_None') + unique_vocabs.remove('EOS_None') + unique_vocabs.remove('Bar_None') + new_unique_vocab = self._augment_pitch_vocab(unique_vocabs) + if self.num_features == 5 or self.num_features == 8: + new_unique_vocab = self._arange_instrument_vocab(new_unique_vocab) + if self.num_features == 7 or self.num_features == 8: + new_unique_vocab = self._arange_chord_vocab(new_unique_vocab) + new_unique_vocab = self._arange_beat_vocab(new_unique_vocab) + new_unique_vocab.insert(0, 'SOS_None') + new_unique_vocab.insert(1, 'EOS_None') + new_unique_vocab.insert(2, 'Bar_None') + idx2event = {int(idx) : tok for idx, tok in enumerate(new_unique_vocab)} + event2idx = {tok : int(idx) for idx, tok in idx2event.items()} + # load premade vocab + else: + idx2event = unique_vocabs + event2idx = {tok : int(idx) for idx, tok in unique_vocabs.items()} + return idx2event, event2idx + + # Augments the pitch vocabulary by expanding the range of pitch values. + def _augment_pitch_vocab(self, unique_vocabs): + pitch_vocab = [x for x in unique_vocabs if 'Note_Pitch_' in x] + pitch_int = [int(x.replace('Note_Pitch_', '')) for x in pitch_vocab if x.replace('Note_Pitch_', '').isdigit()] + min_pitch = min(pitch_int) + max_pitch = max(pitch_int) + min_pitch_margin = max(min_pitch-6, 0) + max_pitch_margin = min(max_pitch+7, 127) + new_pitch_vocab = sorted([f'Note_Pitch_{x}' for x in range(min_pitch_margin, max_pitch_margin+1)], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + new_unique_vocab = [x for x in unique_vocabs if x not in new_pitch_vocab] + new_pitch_vocab + return new_unique_vocab + + # Orders and arranges the instrument vocabulary. + def _arange_instrument_vocab(self, unique_vocabs): + instrument_vocab = [x for x in unique_vocabs if 'Instrument_' in x] + new_instrument_vocab = sorted(instrument_vocab, key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + new_unique_vocab = [x for x in unique_vocabs if x not in new_instrument_vocab] + new_instrument_vocab + return new_unique_vocab + + # Orders and arranges the chord vocabulary, ensuring 'Chord_N_N' is the last token. + def _arange_chord_vocab(self, unique_vocabs): + ''' + for chord augmentation + Chord_N_N should be the last token in the list for an easy implementation of chord augmentation + ''' + chord_vocab = [x for x in unique_vocabs if 'Chord_' in x] + chord_vocab.remove('Chord_N_N') + new_chord_vocab = sorted(chord_vocab, key=lambda x: (not isinstance(x, int), x.split('_')[-1] if isinstance(x, str) else x, x.split('_')[1] if isinstance(x, str) else x)) + new_chord_vocab.append('Chord_N_N') + new_unique_vocab = [x for x in unique_vocabs if x not in new_chord_vocab] + new_chord_vocab + return new_unique_vocab + + # Orders and arranges the beat vocabulary. 
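+ # Plain lexicographic sorting would place 'Beat_10' before 'Beat_2'; the method
+ # below sorts the beat tokens by their integer suffix (Beat_0, Beat_1, Beat_2, ...,
+ # Beat_10, ...) and writes them back into the slots the beat tokens already occupy,
+ # so every other token keeps its position in the vocabulary.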
+ def _arange_beat_vocab(self, unique_vocabs): + beat_vocab = [x for x in unique_vocabs if 'Beat_' in x] + new_beat_vocab = sorted(beat_vocab, key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + count = 0 + for idx, token in enumerate(unique_vocabs): + if 'Beat_' in token: + unique_vocabs[idx] = new_beat_vocab[count] + count += 1 + return unique_vocabs + + # Generates masks for the 'remi' encoding scheme. + def _make_mask(self): + ''' + This function is used to extract the target musical features for validation. + ''' + idx2feature = {} + for idx, feature in self.idx2event.items(): + if feature.startswith('SOS') or feature.startswith('EOS') or feature.startswith('Bar'): + idx2feature[idx] = 'type' + elif feature.startswith('Beat'): + idx2feature[idx] = 'beat' + elif feature.startswith('Chord'): + idx2feature[idx] = 'chord' + elif feature.startswith('Tempo'): + idx2feature[idx] = 'tempo' + elif feature.startswith('Note_Pitch'): + idx2feature[idx] = 'pitch' + elif feature.startswith('Note_Duration'): + idx2feature[idx] = 'duration' + elif feature.startswith('Note_Velocity'): + idx2feature[idx] = 'velocity' + elif feature.startswith('Instrument'): + idx2feature[idx] = 'instrument' + + self.total_mask = {} + self.remi_vocab_boundaries_by_key = {} + for target in self.feature_list: + mask = [0] * len(idx2feature) # Initialize all-zero list of length equal to dictionary + for key, value in idx2feature.items(): + if value == target: + mask[int(key)] = 1 # If value equals target, set corresponding position in mask to 1 + mask = torch.LongTensor(mask) + self.total_mask[target] = mask + start_idx, end_idx = torch.argwhere(mask == 1).flatten().tolist()[0], torch.argwhere(mask == 1).flatten().tolist()[-1] + self.remi_vocab_boundaries_by_key[target] = (start_idx, end_idx+1) + + def decode(self, events:torch.Tensor): + ''' + Used for checking events in the evaluation + events: 1d tensor + ''' + decoded_list = [] + for event in events: + decoded_list.append(self.idx2event[event.item()]) + return decoded_list + + def __call__(self, word): + ''' + for remi style encoding + ''' + return self.event2idx[f"{word['name']}_{word['value']}"] + +class MusicTokenVocabCP(LangTokenVocab): + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + # Initialize the vocabulary class with vocab file path, event data, encoding scheme, and feature count + super().__init__(in_vocab_file_path, event_data, encoding_scheme, num_features) + + def _augment_pitch_vocab(self, unique_vocabs): + # Extract pitch-related vocabularies and adjust pitch range + pitch_total_vocab = unique_vocabs['pitch'] + pitch_vocab = [x for x in pitch_total_vocab if 'Note_Pitch_' in str(x)] + pitch_int = [int(x.replace('Note_Pitch_', '')) for x in pitch_vocab if x.replace('Note_Pitch_', '').isdigit()] + # Determine the min and max pitch values and extend the pitch range slightly + min_pitch = min(pitch_int) + max_pitch = max(pitch_int) + min_pitch_margin = max(min_pitch - 6, 0) + max_pitch_margin = min(max_pitch + 7, 127) + # Create new pitch vocab and ensure new entries do not overlap with existing ones + new_pitch_vocab = [f'Note_Pitch_{x}' for x in range(min_pitch_margin, max_pitch_margin + 1)] + new_pitch_vocab = [x for x in pitch_total_vocab if str(x) not in new_pitch_vocab] + new_pitch_vocab + unique_vocabs['pitch'] = new_pitch_vocab + return unique_vocabs + + def _mp_get_unique_vocab(self, tune, features): + # Read event data 
from a file and collect unique vocabularies for specified features + with open(tune, 'rb') as f: + events_list = pickle.load(f) + unique_vocabs = defaultdict(set) + for event in events_list: + for key in features: + unique_vocabs[key].add(event[key]) + return unique_vocabs + + def _get_chord_vocab(self): + ''' + Manually define the chord vocabulary by combining roots and qualities + from a predefined list. This is used for chord augmentation. + ''' + root_list = ['A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#'] + quality_list = ['+', '/o7', '7', 'M', 'M7', 'm', 'm7', 'o', 'o7', 'sus2', 'sus4'] + chord_vocab = [f'Chord_{root}_{quality}' for root in root_list for quality in quality_list] + # Sort the chord vocabulary based on the root and quality + chord_vocab = sorted(chord_vocab, key=lambda x: (not isinstance(x, int), x.split('_')[-1] if isinstance(x, str) else x, x.split('_')[0] if isinstance(x, str) else x)) + return chord_vocab + + def _cp_sort_type(self, unique_vocabs): + # Similar to _nb_sort_type but used for the 'cp' encoding scheme, sorting vocabularies in a different order + unique_vocabs.remove('SOS') + unique_vocabs.remove('EOS') + unique_vocabs.remove('Metrical') + unique_vocabs.remove('Note') + vocab_list = list(unique_vocabs) + unique_vocabs = sorted(vocab_list, key=sort_key) + unique_vocabs.insert(0, 'SOS') + unique_vocabs.insert(1, 'EOS') + unique_vocabs.insert(2, 'Metrical') + unique_vocabs.insert(3, 'Note') + return unique_vocabs + + # Define custom sorting function + def sort_type_cp(self, item): + if item == 0: + return (0, 0) # Move 0 to the beginning + elif isinstance(item, str): + if item.startswith("Bar"): + return (1, item) # "Bar" items come next, sorted lexicographically + elif item.startswith("Beat"): + # Extract numeric part of "Beat_x" to sort numerically + beat_number = int(item.split('_')[1]) + return (2, beat_number) # "Beat" items come last, sorted by number + return (3, item) # Catch-all for anything unexpected (shouldn't be necessary here) + + def _get_vocab(self, event_data, unique_vocabs=None): + if event_data is not None: + # Create vocab mappings (event2idx, idx2event) from the provided event data + print('start to get unique vocab') + event2idx = {} + idx2event = {} + unique_vocabs = defaultdict(set) + # Use multiprocessing to extract unique vocabularies for each event + with Pool(16) as p: + results = p.starmap(self._mp_get_unique_vocab, tqdm([(tune, self.feature_list) for tune in event_data])) + # Combine results from different processes + for result in results: + for key in self.feature_list: + if key == 'chord': # Chords are handled separately + continue + unique_vocabs[key].update(result[key]) + # Augment pitch vocab and add manually defined chord vocab + unique_vocabs = self._augment_pitch_vocab(unique_vocabs) + unique_vocabs['chord'] = self._get_chord_vocab() + # Process each feature type, handling special cases like 'tempo' and 'chord' + for key in self.feature_list: + if key == 'tempo': + remove_nn_flag = False + if 'Tempo_N_N' in unique_vocabs[key]: + unique_vocabs[key].remove('Tempo_N_N') + remove_nn_flag = True + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + if remove_nn_flag: + unique_vocabs[key].insert(1, 'Tempo_N_N') + elif key == 'chord': + unique_vocabs[key].insert(0, 0) + unique_vocabs[key].insert(1, 'Chord_N_N') + elif key == 'type': # Sort 'type' vocab depending on the encoding scheme + if self.encoding_scheme == 'cp': 
+ unique_vocabs[key] = self._cp_sort_type(unique_vocabs[key]) + else: # NB encoding scheme + unique_vocabs[key] = self._nb_sort_type(unique_vocabs[key]) + elif key == 'beat' and self.encoding_scheme == 'cp': # Handle 'beat' vocab with 'cp' scheme + # unique_vocabs[key].remove('Bar') + # unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), Fraction(x.split('_')[-1] if isinstance(x, str) else x))) + # unique_vocabs[key].insert(1, 'Bar') + unique_vocabs[key] = sorted(unique_vocabs[key], key = self.sort_type_cp) + elif key == 'beat' and self.encoding_scheme == 'nb': # Handle 'beat' vocab with 'nb' scheme + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + elif key == 'instrument': # Sort 'instrument' vocab by integer values + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + else: # Default case: sort by integer values for other keys + unique_vocabs[key] = sorted(unique_vocabs[key], key=lambda x: (not isinstance(x, int), int(x.split('_')[-1] if isinstance(x, str) else x))) + # Create event2idx and idx2event mappings for each feature + event2idx[key] = {tok: int(idx) for idx, tok in enumerate(unique_vocabs[key])} + idx2event[key] = {int(idx): tok for idx, tok in enumerate(unique_vocabs[key])} + return idx2event, event2idx + else: + # If no event data, simply map unique vocab to indexes + event2idx = {} + for key in self.feature_list: + event2idx[key] = {tok: int(idx) for idx, tok in unique_vocabs[key].items()} + return unique_vocabs, event2idx + + def get_vocab_size(self): + # Return the size of the vocabulary for each feature + return {key: len(self.idx2event[key]) for key in self.feature_list} + + def __call__(self, event): + # Convert an event to its corresponding indices + return [self.event2idx[key][event[key]] for key in self.feature_list] + + def decode(self, events:torch.Tensor): + decoded_list = [] + for event in events: + decoded_list.append([self.idx2event[key][event[idx].item()] for idx, key in enumerate(self.feature_list)]) + return decoded_list + +class MusicTokenVocabNB(MusicTokenVocabCP): + def __init__( + self, + in_vocab_file_path:Union[Path, None], + event_data: list, + encoding_scheme: str, + num_features: int + ): + super().__init__(in_vocab_file_path, event_data, encoding_scheme, num_features) + + def _nb_sort_type(self, unique_vocabs): + # Remove special tokens and sort the remaining vocab list, then re-insert the special tokens in order + unique_vocabs.remove('SOS') + unique_vocabs.remove('EOS') + unique_vocabs.remove('Empty_Bar') + unique_vocabs.remove('SSS') + unique_vocabs.remove('SSN') + unique_vocabs.remove('SNN') + vocab_list = list(unique_vocabs) + unique_vocabs = sorted(vocab_list, key=sort_key) + unique_vocabs.insert(0, 'SOS') + unique_vocabs.insert(1, 'EOS') + unique_vocabs.insert(2, 'Empty_Bar') + unique_vocabs.insert(3, 'SSS') + unique_vocabs.insert(4, 'SSN') + unique_vocabs.insert(5, 'SNN') + return unique_vocabs \ No newline at end of file diff --git a/demo/Amadeus_app_CN.py b/demo/Amadeus_app_CN.py new file mode 100644 index 0000000..099d8aa --- /dev/null +++ b/demo/Amadeus_app_CN.py @@ -0,0 +1,223 @@ +from email.mime import audio +import torch +from pathlib import Path +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +import gradio as gr 
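+# The next two lines put the repository root on sys.path so that the `Amadeus`
+# and `data_representation` packages resolve when this demo is launched directly
+# from the demo/ folder (e.g. `python demo/Amadeus_app_CN.py`).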
+import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from Amadeus.train_utils import adjust_prediction_order +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +# === 保持原来的工具函数 === +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + vocab_path = next(ckpt_dir.glob('vocab*')) + + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path: str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + vocab_name = {'remi': 'LangTokenVocab', 'cp': 'MusicTokenVocabCP', 'nb': 'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + return AmadeusModel, vocab + + +def load_resources(wandb_exp_dir, device): + wandb_exp_dir = Path(wandb_exp_dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + wandb_exp_dir + ) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + + +import time + +def generate_with_text_prompt(config, vocab, model, device, prompt, text_encoder_model, + sampling_method='top_p', threshold=0.99, + temperature=1.15, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(text_encoder_model) + encoder = T5EncoderModel.from_pretrained(text_encoder_model).to(device) + context = tokenizer(prompt, 
return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, config.data_params.first_pred_feature) + + # === 生成带时间戳的文件名 === + timestamp = time.strftime("%Y%m%d_%H%M%S") + Path("outputs").mkdir(exist_ok=True) + output_file = Path("outputs") / f"generated_{timestamp}.mid" + + decoder(generated_sample, output_path=str(output_file)) + return str(output_file) + +# === Gradio Demo === +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +model_id = "models/Amadeus-S" # 模型路径,可以是 Amadeus-S, Amadeus-M, Amadeus-L +# check if model exists +if not Path(model_id).exists(): + # download from huggingface + import os + from huggingface_hub import snapshot_download + + os.makedirs("models", exist_ok=True) + + local_dir = snapshot_download( + repo_id="longyu1315/Amadeus-S", + repo_type="model", + local_dir="models" + ) + + print("模型已下载到:", local_dir) + +config, model, vocab = load_resources(model_id, device) + +# 示例 prompts +examples = { + "prompt1": "A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.", + "prompt2": "A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.", + "prompt3": " A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.", + "prompt4": "A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. 
The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music.", +} + +def gradio_generate(prompt, threshold, temperature, length): + if "Amadeus-M" in model_id or "Amadeus-L" in model_id: + encoder_choice ="large" + else: + encoder_choice = "base" + text_encoder_model = 'google/flan-t5-base' if encoder_choice == 'base' else 'google/flan-t5-large' + midi_path = generate_with_text_prompt( + config, + vocab, + model, + device, + prompt, + text_encoder_model, + threshold=threshold, + temperature=temperature, + generation_length=length, + ) + # === 根据 MIDI 文件名生成对应的 WAV 文件名 === + audio_path = midi_path.replace('.mid', '.wav').replace('generated', 'music/generated') + return midi_path, audio_path + +with gr.Blocks() as demo: + gr.Markdown("# 🎵 Amadeus MIDI Generation Demo") + gr.Markdown( + "### 🎵 Prompt 输入指南\n" + "请尽量包含以下要素:\n" + "- 曲风(如 pop, electronic, ambient...)\n" + "- 乐器(如 piano, guitar, drums, strings...)\n" + "- 调式(如 C major, F# minor...)\n" + "- 拍号(如 4/4, 3/4...)\n" + "- 速度(如 120 BPM, Andante, Allegro...)\n" + "- 和弦走向(如 C, G, Am, F...)\n" + "- 情绪(如 happy, relaxing, motivational...)" + "推荐从示例中选择初始 Prompt 进行修改。" + ) + with gr.Row(): + prompt = gr.Textbox(label="输入文本描述 (Prompt)", placeholder="A lively rock and electronic fusion, this song radiates happiness and energy. Distorted guitars, a rock organ, and driving drums propel the melody forward in a fast-paced 4/4 time signature. Set in the key of A major, it features a chord progression of E, D, A/G, E, and D, creating a dynamic and engaging sound that would be right at home in a video game soundtrack.") + with gr.Row(): + threshold = gr.Slider(0.5, 1.0, 0.99, step=0.01, label="阈值") + temperature = gr.Slider(0.5, 3.0, 1.25, step=0.05, label="温度") + length = gr.Slider(256, 3072, 1024, step=128, label="生成长度") + generate_btn = gr.Button("生成 MIDI 🎼") + midi_file = gr.File(label="下载生成的 MIDI 文件") + audio_output = gr.Audio(label="生成的音频预览", type="filepath") + generate_btn.click(fn=gradio_generate, + inputs=[prompt, threshold, temperature, length], + outputs=[midi_file, audio_output]) + gr.Markdown("### 示例 Prompt\n" + "prompt1: A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.\n\n" + "prompt2: A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.\n\n" + "prompt3: A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.\n\n" + "prompt4: A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. 
The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music." + ) + + with gr.Row(): + for name, text in examples.items(): + # show text on button click + btn = gr.Button(name) + btn.click(lambda t=text: t, None, prompt) + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860, share=True) \ No newline at end of file diff --git a/demo/Amadeus_app_EN.py b/demo/Amadeus_app_EN.py new file mode 100644 index 0000000..e460289 --- /dev/null +++ b/demo/Amadeus_app_EN.py @@ -0,0 +1,222 @@ +from email.mime import audio +import torch +from pathlib import Path +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +import gradio as gr +import os, sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from Amadeus.train_utils import adjust_prediction_order +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +# === Keep original utility functions === +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + vocab_path = next(ckpt_dir.glob('vocab*')) + + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path: str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + vocab_name = {'remi': 'LangTokenVocab', 'cp': 'MusicTokenVocabCP', 'nb': 'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + return AmadeusModel, vocab + + +def load_resources(wandb_exp_dir, device): + wandb_exp_dir = Path(wandb_exp_dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + wandb_exp_dir + ) + 
config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + + +import time + +def generate_with_text_prompt(config, vocab, model, device, prompt, text_encoder_model, + sampling_method='top_p', threshold=0.99, + temperature=1.15, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(text_encoder_model) + encoder = T5EncoderModel.from_pretrained(text_encoder_model).to(device) + context = tokenizer(prompt, return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, config.data_params.first_pred_feature) + + # === Generate filename with timestamp === + timestamp = time.strftime("%Y%m%d_%H%M%S") + Path("outputs").mkdir(exist_ok=True) + output_file = Path("outputs") / f"generated_{timestamp}.mid" + + decoder(generated_sample, output_path=str(output_file)) + return str(output_file) + +# === Gradio Demo === +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +model_id = "models/Amadeus-S" # 模型路径,可以是 Amadeus-S, Amadeus-M, Amadeus-L +# check if model exists +if not Path(model_id).exists(): + # download from huggingface + import os + from huggingface_hub import snapshot_download + + os.makedirs("models", exist_ok=True) + + local_dir = snapshot_download( + repo_id="longyu1315/Amadeus-S", + repo_type="model", + local_dir="models" + ) + + print("模型已下载到:", local_dir) +config, model, vocab = load_resources(model_id, device) + +# Example prompts +examples = { + "prompt1": "A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.", + "prompt2": "A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.", + "prompt3": " A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. 
Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.", + "prompt4": "A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music.", +} + +def gradio_generate(prompt, threshold, temperature, length): + if "Amadeus-M" in model_id or "Amadeus-L" in model_id: + encoder_choice ="large" + else: + encoder_choice = "base" + text_encoder_model = 'google/flan-t5-base' if encoder_choice == 'base' else 'google/flan-t5-large' + midi_path = generate_with_text_prompt( + config, + vocab, + model, + device, + prompt, + text_encoder_model, + threshold=threshold, + temperature=temperature, + generation_length=length, + ) + # === Generate corresponding WAV filename === + audio_path = midi_path.replace('.mid', '.wav').replace('generated', 'music/generated') + return midi_path, audio_path + +with gr.Blocks() as demo: + gr.Markdown("# 🎵 Amadeus MIDI Generation Demo") + gr.Markdown( + "### 🎵 Prompt Input Guide\n" + "Please try to include the following elements:\n" + "- Genre (e.g. pop, electronic, ambient...)\n" + "- Instruments (e.g. piano, guitar, drums, strings...)\n" + "- Key (e.g. C major, F# minor...)\n" + "- Time signature (e.g. 4/4, 3/4...)\n" + "- Tempo (e.g. 120 BPM, Andante, Allegro...)\n" + "- Chord progression (e.g. C, G, Am, F...)\n" + "- Mood (e.g. happy, relaxing, motivational...)\n" + "We recommend starting from an example prompt and then modifying it." + ) + with gr.Row(): + prompt = gr.Textbox(label="Text Description (Prompt)", placeholder="A lively rock and electronic fusion, this song radiates happiness and energy. Distorted guitars, a rock organ, and driving drums propel the melody forward in a fast-paced 4/4 time signature. Set in the key of A major, it features a chord progression of E, D, A/G, E, and D, creating a dynamic and engaging sound that would be right at home in a video game soundtrack.") + with gr.Row(): + threshold = gr.Slider(0.5, 1.0, 0.99, step=0.01, label="Threshold") + temperature = gr.Slider(0.5, 3.0, 1.25, step=0.05, label="Temperature") + length = gr.Slider(256, 3072, 1024, step=128, label="Generation Length") + generate_btn = gr.Button("Generate MIDI 🎼") + midi_file = gr.File(label="Download Generated MIDI File") + audio_output = gr.Audio(label="Generated Audio Preview", type="filepath") + generate_btn.click(fn=gradio_generate, + inputs=[prompt, threshold, temperature, length], + outputs=[midi_file, audio_output]) + gr.Markdown("### Example Prompts\n" + "prompt1: A melodic electronic ambient song with a touch of darkness, set in the key of E major and a 4/4 time signature. Tubular bells, electric guitar, synth effects, synth pad, and oboe weave together to create an epic, space-like atmosphere. The tempo is a steady Andante, and the chord progression of A, B, and E forms the harmonic backbone of this captivating piece.\n\n" + "prompt2: A melodic electronic song with a moderate tempo, featuring a blend of drums, piano, brass section, alto saxophone, and synth bass. The piece is set in B minor and follows a chord progression of C#m, B, A, and B. 
With a duration of 252 seconds, it evokes a dreamy and relaxing atmosphere, perfect for corporate settings.\n\n" + "prompt3: A soothing pop song that evokes feelings of love and relaxation, featuring a gentle blend of piano, flute, violin, and acoustic guitar. Set in the key of C major with a 4/4 time signature, the piece moves at an Andante tempo, creating a meditative and emotional atmosphere. The chord progression of G, C, F, G, and C adds to the song's calming ambiance.\n\n" + "prompt4: A lively and melodic rock song with a touch of pop, featuring pizzicato strings that add a playful and upbeat vibe. The piece is set in A minor and maintains a fast tempo of 148 beats per minute, with a 4/4 time signature. The chord progression of C, G, Fmaj7, C, and G repeats throughout the song, creating a catchy and energetic atmosphere that's perfect for corporate or background music." + ) + + with gr.Row(): + for name, text in examples.items(): + # show text on button click + btn = gr.Button(name) + btn.click(lambda t=text: t, None, prompt) + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/demo/app.py b/demo/app.py new file mode 100644 index 0000000..e69de29 diff --git a/demo/requirements.txt b/demo/requirements.txt new file mode 100644 index 0000000..b2efddd --- /dev/null +++ b/demo/requirements.txt @@ -0,0 +1,9 @@ +transformers +torch +gradio +omegaconf +x_transformers +matplotlib +music21 +muspy +SentencePiece \ No newline at end of file diff --git a/demo/text2midi_app.py b/demo/text2midi_app.py new file mode 100644 index 0000000..e69de29 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..6c2cd12 --- /dev/null +++ b/environment.yml @@ -0,0 +1,228 @@ +name: Amadeus +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=4.5 + - bzip2=1.0.8 + - ca-certificates=2025.1.31 + - ld_impl_linux-64=2.43 + - libffi=3.4.6 + - libgcc=14.2.0 + - libgcc-ng=14.2.0 + - libgomp=14.2.0 + - liblzma=5.6.4 + - libnsl=2.0.1 + - libsqlite=3.49.1 + - libuuid=2.38.1 + - libxcrypt=4.4.36 + - libzlib=1.3.1 + - ncurses=6.5 + - openssl=3.4.1 + - pip=25.0.1 + - python=3.10.16 + - readline=8.2 + - setuptools=75.8.0 + - tk=8.6.13 + - pip: + - accelerate==1.5.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.10 + - aiosignal==1.3.1 + - annotated-types==0.7.0 + - anthropic==0.59.0 + - antlr4-python3-runtime==4.9.3 + - anyio==4.9.0 + - async-timeout==5.0.1 + - attrs==24.2.0 + - audioread==3.0.1 + - beartype==0.19.0 + - bidict==0.23.1 + # - blis==1.0.1 + - braceexpand==0.1.7 + - catalogue==2.0.10 + - certifi==2024.8.30 + - cffi==1.17.1 + - chardet==5.2.0 + - charset-normalizer==3.4.0 + - chorder==0.1.4 + - click==8.1.7 + - cloudpathlib==0.20.0 + - coloredlogs==15.0.1 + - colt5-attention==0.11.1 + - confection==0.1.5 + - contourpy==1.3.1 + - cycler==0.12.1 + - cymem==2.0.10 + - datasets==3.1.0 + - decorator==5.2.1 + - deepspeed==0.16.7 + - dill==0.3.8 + - distro==1.9.0 + - docker-pycreds==0.4.0 + - einops==0.8.0 + - einx==0.3.0 + - encodec==0.1.1 + - evaluate==0.4.3 + - exceptiongroup==1.3.0 + - filelock==3.16.1 + - fire==0.7.0 + - fonttools==4.56.0 + - frozendict==2.4.6 + - frozenlist==1.5.0 + - fsspec==2024.9.0 + - ftfy==6.3.1 + - gitdb==4.0.11 + - gitpython==3.1.43 + - h11==0.16.0 + - h5py==3.13.0 + - hf-xet==1.1.4 + - hjson==3.1.0 + - httpcore==1.0.9 + - httpx==0.28.1 + - huggingface-hub==0.33.0 + - humanfriendly==10.0 + - hydra-core==1.3.2 + - hypy-utils==1.0.29 + - idna==3.10 + - 
iniconfig==2.1.0 + - jinja2==3.1.4 + - jiter==0.10.0 + - joblib==1.4.2 + - jsonlines==4.0.0 + - jsonpickle==4.0.5 + - kiwisolver==1.4.8 + # - laion-clap==1.1.7 + - langcodes==3.5.0 + - langdetect==1.0.9 + - language-data==1.3.0 + - lazy-loader==0.4 + - librosa==0.10.2.post1 + - llvmlite==0.41.1 + - local-attention==1.9.15 + - loguru==0.7.3 + - marisa-trie==1.2.1 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.10.1 + - mdurl==0.1.2 + - megabyte-pytorch==0.3.6 + - midi2audio==0.1.1 + - miditok==3.0.3 + - miditoolkit==1.0.1 + - mido==1.3.3 + - more-itertools==10.7.0 + - mpmath==1.3.0 + # - msclap==1.3.3 + - msgpack==1.1.0 + - multidict==6.1.0 + - multiprocess==0.70.16 + - murmurhash==1.0.11 + - music21==9.5.0 + - muspy==0.5.0 + - networkx==3.4.2 + - ninja==1.11.1.3 + - nnaudio==0.3.3 + # - numba==0.58.1 + # - numpy==1.26.4 + - nvidia-cublas-cu12==12.6.4.1 + - nvidia-cuda-cupti-cu12==12.6.80 + - nvidia-cuda-nvrtc-cu12==12.6.77 + - nvidia-cuda-runtime-cu12==12.6.77 + - nvidia-cudnn-cu12==9.5.1.17 + - nvidia-cufft-cu12==11.3.0.4 + - nvidia-cufile-cu12==1.11.1.6 + - nvidia-curand-cu12==10.3.7.77 + - nvidia-cusolver-cu12==11.7.1.2 + - nvidia-cusparse-cu12==12.5.4.2 + - nvidia-cusparselt-cu12==0.6.3 + - nvidia-ml-py==12.570.86 + - nvidia-nccl-cu12==2.26.2 + - nvidia-nvjitlink-cu12==12.6.85 + - nvidia-nvtx-cu12==12.6.77 + - omegaconf==2.3.0 + - packaging==24.2 + - pandas==2.2.3 + - peft==0.14.0 + - pillow==11.1.0 + - platformdirs==4.3.6 + - pluggy==1.6.0 + - pooch==1.8.2 + - preshed==3.0.9 + - pretty-midi==0.2.10 + - progressbar==2.5 + - propcache==0.2.1 + - protobuf==5.29.1 + - psutil==6.1.0 + - py-cpuinfo==9.0.0 + - pyarrow==18.1.0 + - pycparser==2.22 + - pydantic==2.10.3 + - pydantic-core==2.27.1 + - pydub==0.25.1 + - pygments==2.18.0 + - pyparsing==3.2.1 + - pypianoroll==1.0.4 + - pysmartdl==1.3.4 + - pytest==8.4.0 + - python-dateutil==2.9.0.post0 + - pytz==2024.2 + - pyyaml==6.0.2 + - regex==2024.11.6 + - requests==2.32.3 + - resampy==0.4.3 + - rich==13.9.4 + - safetensors==0.4.5 + - scikit-learn==1.6.1 + - scipy==1.15.2 + - seaborn==0.13.2 + - sentencepiece==0.2.0 + - sentry-sdk==2.19.2 + - setproctitle==1.3.4 + - sf2utils==1.0.0 + - shellingham==1.5.4 + - six==1.17.0 + - smart-open==7.0.5 + - smmap==5.0.1 + - sniffio==1.3.1 + - soundfile==0.12.1 + - soxr==0.5.0.post1 + - spacy==3.8.2 + - spacy-legacy==3.0.12 + - spacy-loggers==1.0.5 + - srsly==2.4.8 + - st-moe-pytorch==0.1.8 + - sudachidict-core==20250129 + - sudachipy==0.6.10 + - sympy==1.14.0 + - symusic==0.5.5 + - termcolor==2.5.0 + # - thinc==8.3.2 + - threadpoolctl==3.5.0 + - tokenizers==0.21.0 + - tomli==2.2.1 + - torch==2.7.0 + - torchaudio==2.7.0 + - torchlibrosa==0.1.0 + # - torchvision==0.16.2 + - tqdm==4.67.1 + - transformers==4.52.4 + - triton==3.3.0 + - typer==0.15.1 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - urllib3==2.2.3 + - wandb==0.19.0 + - wasabi==1.1.3 + - wcwidth==0.2.13 + - weasel==0.4.1 + - webcolors==24.11.1 + - webdataset==0.2.111 + - wget==3.2 + - wheel==0.41.3 + - wrapt==1.17.0 + - x-transformers==2.3.1 + - xxhash==3.5.0 + - yarl==1.18.3 +prefix: ~/.conda/envs/Amadeus diff --git a/generate-batch.py b/generate-batch.py new file mode 100644 index 0000000..f43f1c0 --- /dev/null +++ b/generate-batch.py @@ -0,0 +1,336 @@ +import sys +import os +from pathlib import Path +from multiprocessing import Process,set_start_method +import torch +import argparse +from omegaconf import OmegaConf +import json + +from Amadeus.evaluation_utils import ( + wandb_style_config_to_omega_config, + 
prepare_model_and_dataset_from_config, + get_best_ckpt_path_and_config, + Evaluator +) + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-generation_type", + type=str, + choices=('conditioned', 'unconditioned', 'text-conditioned'), + default='unconditioned', + help="generation type", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-num_samples", + type=int, + default=30, + help="number of samples to generate", + ) + parser.add_argument( + "-num_target_measure", + type=int, + default=4, + help="number of target measures for conditioned generation", + ) + parser.add_argument( + "-choose_selected_tunes", + action='store_true', + help="generate samples from selected tunes, only for SOD dataset", + ) + parser.add_argument( + "-generate_length", + type=int, + default=1024, + help="length of the generated sequence", + ) + parser.add_argument( + "-num_processes", + type=int, + default=4, + help="number of processes to use", + ) + parser.add_argument( + "-gpu_ids", + type=str, + default="1,2,3,5", + help="comma-separated list of GPU IDs to use (e.g., '0,1,2,3')", + ) + parser.add_argument( + "-prompt", + type=str, + default="With a rhythm of 100 BPM, this classical piece in 1/4 time signature in the key of Eb major creates a classical mood using String Ensemble, Pizzicato Strings, Tremolo Strings, Trumpet, Timpani.", + help="prompt for generation, only used for conditioned generation", + ) + parser.add_argument( + "-prompt_file", + type=str, + default="dataset/midicaps/train.json", + help="file containing prompts for text-conditioned generation", + ) + return parser + +def load_resources(wandb_exp_dir, device): + """Load model and dataset resources for a process""" + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, wandb_exp_dir) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + # Load checkpoint to specified device + ckpt = torch.load(ckpt_path, map_location=device) + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + # Prepare dataset for prompts + condition_list = [x[1] for x in test_set.data_list] + dataset_for_prompt = [] + for i in range(len(condition_list)): + condition = test_set.get_segments_with_tune_idx(condition_list[i], 0)[0] + dataset_for_prompt.append((condition, condition_list[i])) + + return config, model, dataset_for_prompt, vocab + +def conditioned_worker(process_idx, gpu_id, args, data_slice): + """Worker process for conditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + 
f"cond_{args.num_target_measure}m_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Process assigned data slice + for idx, (tune_in_idx, tune_name) in enumerate(data_slice): + batch_dir = base_path / f"process_{process_idx}_batch_{idx}" + batch_dir.mkdir(parents=True, exist_ok=True) + evaluator.generate_samples_with_prompt( + batch_dir, + args.num_target_measure, + tune_in_idx, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length + ) + +def unconditioned_worker(process_idx, gpu_id, args, num_samples): + """Worker process for unconditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"uncond_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Generate assigned number of samples + batch_dir = base_path + evaluator.generate_samples_unconditioned( + batch_dir, + num_samples, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}" + ) +def text_conditioned_worker(process_idx, gpu_id, args, num_samples, data_slice): + """Worker process for unconditioned generation""" + torch.cuda.set_device(gpu_id) + device = torch.device(f'cuda:{gpu_id}') + + # Load resources with proper device + config, model, dataset_for_prompt, vocab = load_resources(args.wandb_exp_dir, device) + + # Create output directory with process index + base_path = Path('wandb') / args.wandb_exp_dir / \ + f"text_condi_{args.sampling_method}_t{args.threshold}_temp{args.temperature}" + base_path.mkdir(parents=True, exist_ok=True) + + evaluator = Evaluator(config, model, dataset_for_prompt, vocab, device=device) + + # Generate assigned number of samples + batch_dir = base_path + for idx, tune_name in enumerate(data_slice): + print(f"Process {process_idx} generating samples for tune: {tune_name}") + evaluator.generate_samples_with_text_prompt( + batch_dir, + tune_name, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + uid=f"{process_idx}" + ) +def main(): + # use spawn method for multiprocessing + set_start_method('spawn', force=True) + args = get_argument_parser().parse_args() + gpu_ids = list(map(int, args.gpu_ids.split(','))) + + # Validate GPU availability + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available") + if len(gpu_ids) == 0: + raise ValueError("At least one GPU must be specified") + + # Validate process count + if args.num_processes < 1: + raise ValueError("Number of processes must be at least 1") + if len(gpu_ids) < args.num_processes: + print(f"Warning: More processes ({args.num_processes}) than GPUs ({len(gpu_ids)}), some GPUs will be shared") + + # Prepare data slices for processes + processes = [] + try: + if args.generation_type == 'conditioned': + # Prepare selected tunes + wandb_dir = Path('wandb') / args.wandb_exp_dir 
+ if not wandb_dir.exists(): + raise FileNotFoundError(f"Experiment {args.wandb_exp_dir} not found") + + # Load test set to get selected tunes (dummy load to get dataset info) + dummy_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + _, test_set, _ = prepare_model_and_dataset_from_config( + wandb_dir / "files" / "config.yaml", + wandb_dir / "files" / "metadata.json", + wandb_dir / "files" / "vocab.json" + ) + + if args.choose_selected_tunes and test_set.dataset == 'SOD': + selected_tunes = ['Requiem_orch', 'magnificat_bwv-243_8_orch', + "Clarinet Concert in A Major: 2nd Movement, Adagio_orch"] + else: + selected_tunes = [name for _, name in test_set.data_list][:args.num_samples] + + # Split selected data across processes + selected_data = [d for d in test_set.data_list if d[1] in selected_tunes] + chunk_size = (len(selected_data) + args.num_processes - 1) // args.num_processes + + for i in range(args.num_processes): + start_idx = i * chunk_size + end_idx = min((i+1)*chunk_size, len(selected_data)) + data_slice = selected_data[start_idx:end_idx] + + if not data_slice: + continue + + gpu_id = gpu_ids[i % len(gpu_ids)] + p = Process( + target=conditioned_worker, + args=(i, gpu_id, args, data_slice) + ) + processes.append(p) + p.start() + + elif args.generation_type == 'unconditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + p = Process( + target=unconditioned_worker, + args=(i, gpu_id, args, samples) + ) + processes.append(p) + p.start() + elif args.generation_type == 'text-conditioned': + samples_per_proc = args.num_samples // args.num_processes + remainder = args.num_samples % args.num_processes + # Load prompts from file + prompt_name_list = [] + with open(args.prompt_file, 'r') as f: + for line in f: + if not line.strip(): + continue + prompt_data = json.loads(line.strip()) + prompt_text = prompt_data['caption'] + if prompt_data['test_set'] is True: + prompt_name_list.append(prompt_text) + print("length of prompt_name_list:", len(prompt_name_list)) + if len(prompt_name_list) >= args.num_samples: + print(f"Reached the limit of {args.num_samples} prompts.") + break + for i in range(args.num_processes): + gpu_id = gpu_ids[i % len(gpu_ids)] + samples = samples_per_proc + (1 if i < remainder else 0) + + if samples <= 0: + continue + + # Split prompt names across processes + start_idx = i * (len(prompt_name_list) // args.num_processes) + end_idx = (i + 1) * (len(prompt_name_list) // args.num_processes) + data_slice = prompt_name_list[start_idx:end_idx] + + p = Process( + target=text_conditioned_worker, + args=(i, gpu_id, args, samples, data_slice) + ) + processes.append(p) + p.start() + # Wait for all processes to complete + for p in processes: + p.join() + + except Exception as e: + print(f"Error in main process: {str(e)}") + for p in processes: + p.terminate() + raise + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/generate.py b/generate.py new file mode 100644 index 0000000..4922099 --- /dev/null +++ b/generate.py @@ -0,0 +1,210 @@ +import torch +from pathlib import Path +import argparse +import json +from collections import defaultdict +from omegaconf import OmegaConf, DictConfig +from transformers import T5Tokenizer, T5EncoderModel +from Amadeus.train_utils import adjust_prediction_order + +from 
Amadeus.evaluation_utils import ( + get_dir_from_wandb_by_code, + wandb_style_config_to_omega_config, +) +from Amadeus.symbolic_encoding import decoding_utils, data_utils +from data_representation import vocab_utils +from Amadeus import model_zoo +from Amadeus.symbolic_encoding.compile_utils import reverse_shift_and_pad_for_tensor + + +def get_argument_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-wandb_exp_dir", + required=True, + type=str, + help="wandb experiment directory", + ) + parser.add_argument( + "-prompt", + required=True, + type=str, + help="text prompt for generation", + ) + parser.add_argument( + "-output_dir", + type=str, + default="outputs", + help="directory to save results", + ) + parser.add_argument( + "-sampling_method", + type=str, + choices=('top_p', 'top_k'), + default='top_p', + help="sampling method", + ) + parser.add_argument( + "-threshold", + type=float, + default=0.99, + help="threshold", + ) + parser.add_argument( + "-temperature", + type=float, + default=1.15, + help="temperature", + ) + parser.add_argument( + "-generate_length", + type=int, + default=2048, + help="length of the generated sequence", + ) + parser.add_argument( + "-text_encoder_model", + type=str, + default='google/flan-t5-large', + help="pretrained text encoder model", + ) + return parser + +def get_best_ckpt_path_and_config(dir): + if dir is None: + raise ValueError('No such code in wandb_dir') + ckpt_dir = dir / 'files' / 'checkpoints' + + config_path = dir / 'files' / 'config.yaml' + # print all files in ckpt_dir + vocab_path = next(ckpt_dir.glob('vocab*')) + + # if there is pt file ending with 'last', return it + if len(list(ckpt_dir.glob('*last.pt'))) > 0: + last_ckpt_fn = next(ckpt_dir.glob('*last.pt')) + else: + pt_fns = sorted(list(ckpt_dir.glob('*.pt')), key=lambda fn: int(fn.stem.split('_')[0].replace('iter', ''))) + last_ckpt_fn = pt_fns[-1] + + return last_ckpt_fn, config_path, vocab_path + +def prepare_model_and_dataset_from_config(config: DictConfig, vocab_path:str): + nn_params = config.nn_params + vocab_path = Path(vocab_path) + + # print(config) + encoding_scheme = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=vocab_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + # Create the Transformer model based on configuration parameters + AmadeusModel = getattr(model_zoo, nn_params.model_name)( + vocab=vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + 
depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + return AmadeusModel, [], vocab + +def load_resources(dir, device): + """Load model and dataset resources""" + dir = Path(dir) + ckpt_path, config_path, vocab_path = get_best_ckpt_path_and_config( + dir + ) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + + ckpt = torch.load(ckpt_path, map_location=device) + model, _, vocab = prepare_model_and_dataset_from_config(config, vocab_path) + model.load_state_dict(ckpt['model'], strict=False) + model.to(device) + model.eval() + torch.compile(model) + print("total parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad)) + + return config, model, vocab + +def generate_with_text_prompt(config, vocab, model, device, prompt, save_dir, + first_pred_feature, sampling_method, threshold, + temperature, generation_length=1024): + encoding_scheme = config.nn_params.encoding_scheme + tokenizer = T5Tokenizer.from_pretrained(config.text_encoder_model) + encoder = T5EncoderModel.from_pretrained(config.text_encoder_model).to(device) + print(f"Using T5EncoderModel for text prompt:\n{prompt}") + context = tokenizer(prompt, return_tensors='pt', + padding='max_length', truncation=True, max_length=128).to(device) + context = encoder(**context).last_hidden_state + + in_beat_resolution_dict = {'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4} + in_beat_resolution = in_beat_resolution_dict.get(config.dataset, 4) + + midi_decoder_dict = {'remi': 'MidiDecoder4REMI', + 'cp': 'MidiDecoder4CP', + 'nb': 'MidiDecoder4NB'} + decoder_name = midi_decoder_dict[encoding_scheme] + decoder = getattr(decoding_utils, decoder_name)( + vocab=vocab, in_beat_resolution=in_beat_resolution, dataset_name=config.dataset + ) + + generated_sample = model.generate( + 0, generation_length, condition=None, num_target_measures=None, + sampling_method=sampling_method, threshold=threshold, + temperature=temperature, context=context + ) + if encoding_scheme == 'nb': + generated_sample = reverse_shift_and_pad_for_tensor(generated_sample, first_pred_feature) + + save_dir.mkdir(parents=True, exist_ok=True) + + output_file = save_dir / f"generated.mid" + decoder(generated_sample, output_path=str(output_file)) + print(f"Generated file saved at: {output_file}") + + +def main(): + args = get_argument_parser().parse_args() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + config, model, vocab = load_resources(args.wandb_exp_dir, device) + + save_dir = Path(args.output_dir) + config.text_encoder_model = args.text_encoder_model + generate_with_text_prompt( + config, + vocab, + model, + device, + args.prompt, + save_dir, + config.data_params.first_pred_feature, + args.sampling_method, + args.threshold, + args.temperature, + generation_length=args.generate_length, + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..356930b --- /dev/null +++ b/readme.md @@ -0,0 +1,163 @@ +# 🎵 Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music + +

+ HuggingFace | arXiv +

+ +**Amadeus** is a novel **symbolic music (MIDI) generation framework**. We use **autoregressive modeling** for note sequences, **discrete diffusion models** for intra-note attributes, and **representation optimization** to enhance model performance. Compared to current mainstream autoregressive or hierarchical autoregressive models, Amadeus achieves significant improvements in **generation quality, speed, and controllability**. While significantly improving generation quality, we have achieved a speedup of at least **4x** compared to pure autoregressive models. We also support a training-free **fine-grained attribute control** mechanism, which endows Amadeus with maximum flexibility. We will continuously update the **code, models, and datasets**. + + + +*** + +## 🏗️ Model Architecture +

+ Amadeus architecture +

+ + + +*** + +## 📅 Changelog + + + +* 2025-08-28: Released inference code and the **Amadeus-S** model + + + +*** + +## ⚙️ Installation and Usage + +Set up the environment (inference only): + + + +```bash +conda create -n amadeus_slim python=3.10 + +conda activate amadeus_slim + +pip install -r demo/requirements.txt +``` + +First run: + + + +```bash +# Chinese interface + +python demo/Amadeus_app_CN.py + +# English interface + +python demo/Amadeus_app_EN.py +``` + +> Note: +> +> `Amadeus_app_CN.py` +> +> is for the Chinese interface, and +> +> `Amadeus_app_EN.py` +> +> is for the English interface. + +👉 The model will be automatically downloaded to the `models/` folder, which includes a usable **soundfont**. Please modify the path of `DEFAULT_SOUND_FONT` in `Amadeus/symbolic_encoding/``midi2audio.py`. + +Example of command-line generation: + + + +``` +python generate.py -wandb\_exp\_dir models/Amadeus-S -text\_encoder\_model google/flan-t5-base -temperature 2 -prompt "A lively and melodic pop rock song featuring piano, overdriven guitar, electric drum and electric bass, set in a fast 4/4 tempo and the key of C# minor, with a frequently recurring chord progression of D, A, C#m, and F# that evokes a mix of emotion and love." +``` + + + +*** + +## 📂 Repository Structure + + + +``` +Amadeus/ + +├── demo/ # Example scripts and interfaces (CN/EN) + +├── Amadeus/ # Core model and symbolic encoding + +├── assets/ # Architecture diagrams and sample audio files + +├── data\_representation # Data processing + +├── models/ # Downloaded or cached pre-trained models + +└── generate.py # Command-line generation entry point +``` + + + +*** + +## 📊 Evaluation Results + +We evaluated **generation speed, text alignment, and note attribute control accuracy** on the **MidiCaps** dataset. The results are as follows: + + + +| Model | Speed (notes/s) | CLAP ↑ | TBT ↑ | CK ↑ | CTS ↑ | CI ↑ | CMtop3 ↑ | +| -------------- | --------------- | -------- | --------- | --------- | --------- | --------- | --------- | +| Text2Midi | 4.02 | 0.19 | 31.76 | 22.22 | 84.15 | 19.92 | 60.57 | +| MuseCoco | 1.67 | 0.19 | 34.21 | 14.66 | 94.24 | 22.42 | 38.18 | +| T2M-inferalign | 4.02 | 0.20 | 39.32 | 29.80 | 84.32 | 20.13 | 47.74 | +| **Amadeus** | **16.23** | 0.20 | 73.93 | 39.31 | 96.98 | 26.01 | 65.52 | +| **Amadeus-M** | 10.51 | **0.21** | **76.31** | **43.07** | **97.02** | **27.11** | **66.39** | + + + +*** + +## 🤝 Acknowledgements and Contributions + +The development of Amadeus is inspired by the music and AI communities, with the goal of **serving music creators, not replacing them**. + +We welcome developers and researchers to contribute code or provide suggestions — please reach out to us via **Issues** or **Pull Requests**. + +Part of the design of this project references [JudeJiwoo/nmt](https://github.com/JudeJiwoo/nmt), and we would like to express our gratitude here 🙏. + +*** + +## ⚠️ Notes + +The current model is relatively small and may not always generate MIDI that fully matches the description. +You can try **slightly adjusting parameters such as temperature or top-p** to improve the results. + +We will continue to improve the model to provide more stable and higher-quality generation. 
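+
+If you would rather script this parameter search than re-run the command line, the helpers in `generate.py` can be imported directly. The snippet below is only a minimal sketch: it assumes the **Amadeus-S** checkpoint already sits in `models/Amadeus-S`, that it is run from the repository root, and the prompt and temperature values are purely illustrative.
+
+```python
+from pathlib import Path
+
+import torch
+
+from generate import load_resources, generate_with_text_prompt
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+config, model, vocab = load_resources('models/Amadeus-S', device)
+config.text_encoder_model = 'google/flan-t5-base'  # encoder paired with Amadeus-S in the example above
+
+prompt = ("A soothing pop song featuring piano, flute, violin, and acoustic guitar, "
+          "set in C major with a 4/4 time signature at an Andante tempo.")
+
+# Sweep a few temperatures; each run writes outputs/temp_<T>/generated.mid
+for temperature in (1.0, 1.15, 1.5, 2.0):
+    generate_with_text_prompt(
+        config, vocab, model, device, prompt,
+        save_dir=Path(f'outputs/temp_{temperature}'),
+        first_pred_feature=config.data_params.first_pred_feature,
+        sampling_method='top_p',
+        threshold=0.99,          # top-p mass; lower it for more conservative sampling
+        temperature=temperature,
+        generation_length=1024,
+    )
+```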
+ +*** + +## 📚 Citation + +If you find Amadeus helpful for your research or creative work, please cite our paper: + + + +```bibtex +@article{su2025amadeus, + title = {Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music}, + author = {Su, Hongju and Li, Ke and Yang, Lan and Zhang, Honggang and Song, Yi-Zhe}, + journal = {arXiv preprint arXiv:2508.20665}, + year = {2025} +} +``` + diff --git a/readme_CN.md b/readme_CN.md new file mode 100644 index 0000000..b47fd54 --- /dev/null +++ b/readme_CN.md @@ -0,0 +1,108 @@ +# 🎵 Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music +

+HuggingFace · arXiv
+

+ +**Amadeus** 是一种新型的 **符号音乐 (MIDI) 生成框架**,我们使用 **自回归** 建模音符序列,**离散扩散模型** 建模音符内部属性,并通过 **表征优化** 提升模型性能。相较于当前主流的自回归或分层自回归模型,Amadeus 在 **生成质量、速度与可控性** 上均取得了显著进步。在生成质量显著提升的同时,我们实现了至少 **4x** 于纯自自回归模型的速度提升。我们同时还支持一种免训练的 **细粒度属性控制** ,这赋予了Amadeus最大程度的灵活性。我们会持续更新 **代码,模型和数据集** 。 + + +--- +## 🏗️ 模型架构 +

+ Amadeus architecture +

+ +--- + +## 📅 更新日志 +- 2025-08-28:公布推理代码和 **Amadeus-S** 模型 + +--- + +## ⚙️ 安装与使用 +搭建环境(仅推理): +```bash +conda create -n amadeus_slim python=3.10 +conda activate amadeus_slim +pip install -r demo/requirements.txt +``` + +首次运行: +```bash +# 中文界面 +python demo/Amadeus_app_CN.py + +# 英文界面 +python demo/Amadeus_app_EN.py +``` +> 说明:`Amadeus_app_CN.py` 用于中文界面,`Amadeus_app_EN.py` 用于英文界面。 +👉 模型会自动下载到 `models/` 文件夹,包含一个可用的 **soundfont**。请修改 `Amadeus/symbolic_encoding/midi2audio.py` 中的 `DEFAULT_SOUND_FONT` 路径。 + +命令行生成示例: +```bash +python generate.py -wandb_exp_dir models/Amadeus-S -text_encoder_model google/flan-t5-base -temperature 2 -prompt "A lively and melodic pop rock song featuring piano, overdriven guitar, electric drum and electric bass, set in a fast 4/4 tempo and the key of C# minor, with a frequently recurring chord progression of D, A, C#m, and F# that evokes a mix of emotion and love." +``` + +--- + +## 📂 仓库结构 +``` +Amadeus/ +├── demo/ # 示例脚本与界面 (CN/EN) +├── Amadeus/ # 核心模型与符号编码 +├── assets/ # 架构图与示例音频文件 +├── data_representation # 数据处理 +├── models/ # 下载或缓存的预训练模型 +└── generate.py # 命令行生成入口 +``` + +--- + +## 📊 评测结果 +我们在 **MidiCaps** 数据集上评测了 **生成速度、文本对齐度以及音符属性控制精度**。结果如下: + +| Model | Speed (notes/s) | CLAP ↑ | TBT ↑ | CK ↑ | CTS ↑ | CI ↑ | CMtop3 ↑ | +|--------------|-----------------|--------|-------|------|-------|------|---------------------| +| Text2Midi | 4.02 | 0.19 | 31.76 | 22.22 | 84.15 | 19.92 | 60.57 | +| MuseCoco | 1.67 | 0.19 | 34.21 | 14.66 | 94.24 | 22.42 | 38.18 | +| T2M-inferalign | 4.02 | 0.20 | 39.32 | 29.80 | 84.32 | 20.13 | 47.74 | +| **Amadeus** | **16.23** | 0.20 | 73.93 | 39.31 | 96.98 | 26.01 | 65.52 | +| **Amadeus-M**| 10.51 | **0.21** | **76.31** | **43.07** | **97.02** | **27.11** | **66.39** | + + + + + +--- +## 🤝 致谢与贡献 +Amadeus 的研发受到音乐与 AI 社区的启发,旨在 **服务音乐创作者,而非替代他们**。 +我们欢迎开发者和研究人员贡献代码或提出建议 —— 请通过 **Issues** 或 **Pull Requests** 与我们交流。 + +本项目部分设计参考了 [JudeJiwoo/nmt](https://github.com/JudeJiwoo/nmt),在此表示感谢 🙏。 + + +## ⚠️ 注意事项 + +当前模型规模较小,并不总是能生成完全符合描述的 MIDI。 +您可以尝试 **适当调整温度(temperature)、阈值(top-p 等参数)** 来改善结果。 + +我们会持续改进模型,以提供更稳定和高质量的生成体验。--- + +--- + +## 📚 引用 +如果您觉得 Amadeus 对您的研究或创作有帮助,请引用我们的论文: + +```bibtex +@article{su2025amadeus, + title = {Amadeus: Autoregressive Model with Bidirectional Attribute Modelling for Symbolic Music}, + author = {Su, Hongju and Li, Ke and Yang, Lan and Zhang, Honggang and Song, Yi-Zhe}, + journal = {arXiv preprint arXiv:2508.20665}, + year = {2025} +} + diff --git a/run_evaluation.py b/run_evaluation.py new file mode 100644 index 0000000..b7cac2d --- /dev/null +++ b/run_evaluation.py @@ -0,0 +1,46 @@ +import sys +import torch +from pathlib import Path + +from omegaconf import OmegaConf + +from Amadeus.evaluation_utils import Evaluator, wandb_style_config_to_omega_config, prepare_model_and_dataset_from_config, get_best_ckpt_path_and_config + +def main(exp_code): + wandb_dir = Path('wandb') + ckpt_path, config_path, metadata_path, vocab_path = get_best_ckpt_path_and_config(wandb_dir, exp_code) + config = OmegaConf.load(config_path) + config = wandb_style_config_to_omega_config(config) + print(ckpt_path) + + ckpt = torch.load(ckpt_path, map_location='cpu') + model, test_set, vocab = prepare_model_and_dataset_from_config(config, metadata_path=metadata_path, vocab_path=vocab_path) + model.load_state_dict(ckpt['model']) + model = model.eval() + + evaluator = Evaluator(config, model, test_set, vocab, device='cuda', batch_size=21) + + evaluator.get_perplexity() + evaluator.save_results(wandb_dir / exp_code / 
f'micro_evaluated_perplexity_conti_fixed.pt') + mean_by_class = {} + + for key in evaluator.vocab.feature_list: + # skip type for calculating mean as type or metric token have different meanings across encoding schemes + if key == 'type': + continue + mean_nll = sum(evaluator.loss_by_class[key]) / evaluator.count_by_class[key] + mean_by_class[key] = mean_nll + + # calculate micro average + total_mean_nll = 0 + for key in mean_by_class.keys(): + total_mean_nll += mean_by_class[key] * evaluator.count_by_class[key] + denominator = 0 + for key in mean_by_class.keys(): + denominator += evaluator.count_by_class[key] + total_mean_nll /= denominator + return total_mean_nll + +if __name__ == '__main__': + exp_code = sys.argv[1] + print(main(exp_code)) \ No newline at end of file diff --git a/train_accelerate.py b/train_accelerate.py new file mode 100644 index 0000000..0531d61 --- /dev/null +++ b/train_accelerate.py @@ -0,0 +1,376 @@ +from calendar import c +import os +import copy +from pathlib import Path +from datetime import datetime + +import torch +import torch.multiprocessing as mp +from torch.distributed import init_process_group, destroy_process_group + +from accelerate import Accelerator +from accelerate.utils import set_seed + +import wandb +import hydra +from hydra.core.hydra_config import HydraConfig +from omegaconf import DictConfig, OmegaConf + +# import accelerate +from accelerate import Accelerator +from accelerate.utils import set_seed + +from Amadeus.symbolic_encoding import data_utils, decoding_utils +from Amadeus.symbolic_encoding.data_utils import get_emb_total_size +from Amadeus import model_zoo, trainer_accelerate as trainer +from Amadeus.train_utils import NLLLoss4REMI, NLLLoss4CompoundToken, CosineAnnealingWarmUpRestarts, EncodecFlattenLoss, EncodecMultiClassLoss, CosineLRScheduler, adjust_prediction_order, DiffusionLoss4CompoundToken +from Amadeus.encodec.data_utils import EncodecDataset +from data_representation import vocab_utils +from run_evaluation import main as run_evaluation + +def ddp_setup(rank, world_size, backend='nccl'): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = '12355' + init_process_group(backend, rank=rank, world_size=world_size) + torch.cuda.set_device(rank) + +def generate_experiment_name(config): + # add base hyperparameters to the experiment name + dataset_name = config.dataset + encoding_name = config.nn_params.encoding_scheme + num_features = config.nn_params.num_features + input_embedder_name = config.nn_params.input_embedder_name + sub_decoder_name = config.nn_params.sub_decoder_name + batch_size = config.train_params.batch_size + num_layers = config.nn_params.main_decoder.num_layer + input_length = config.train_params.input_length + first_pred_feature = config.data_params.first_pred_feature + + # Add target hyperparameters to the experiment name + # dropout + main_dropout = config.nn_params.model_dropout + # learning rate + lr_decay_rate = config.train_params.decay_step_rate + + time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # Combine the information into a single string for the experiment name + # experiment_name = f"{time}_{dataset_name}_{encoding_name}{num_features}_{input_embedder_name}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}\ + # _dropout{main_dropout}_lrdecay{lr_decay_rate}" + experiment_name = 
f"{time}_{dataset_name}_{encoding_name}{num_features}_{sub_decoder_name}_firstpred:{first_pred_feature}_inputlen{input_length}_nlayer{num_layers}_batch{batch_size}" + return experiment_name + +def setup_log(config): + if config.general.make_log and config.use_ddp == False: + experiment_name = generate_experiment_name(config) + wandb.init( + project="Amadeus", + name=experiment_name, + config=OmegaConf.to_container(config) + ) + # 保存配置到 WANDB 根目录 + config_path = Path(wandb.run.dir) / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + save_dir = Path(wandb.run.dir) / "checkpoints" + save_dir.mkdir(exist_ok=True, parents=True) + else: + now = datetime.now() + save_dir = Path('wandb/debug/checkpoints') / now.strftime('%y-%m-%d_%H-%M-%S') + save_dir.mkdir(exist_ok=True, parents=True) + # 保存配置到调试目录 + config_path = save_dir / "config.yaml" + OmegaConf.save(config, config_path) # 关键代码 + + return str(save_dir) + +# Prepare symbolic dataset and model for training +def preapre_sybmolic(config: DictConfig, save_dir: str, rank: int) -> trainer.LanguageModelTrainer: + # Extract neural network parameters, dataset name, encoding scheme, and number of features from the configuration + nn_params = config.nn_params + dataset_name = config.dataset + encoding_scheme = nn_params.encoding_scheme + num_features = nn_params.num_features + + # get proper prediction order according to the encoding scheme and target feature in the config + prediction_order = adjust_prediction_order(encoding_scheme, num_features, config.data_params.first_pred_feature, nn_params) + + # Prepare paths for input and output vocabulary files + vocab_dir = Path(f'vocab/vocab_{dataset_name}') + in_vocab_file_path = vocab_dir / f'vocab_{dataset_name}_{encoding_scheme}{num_features}.json' + out_vocab_path = Path(save_dir) / f'vocab_{dataset_name}_{encoding_scheme}{num_features}.json' + + # get vocab + vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'} + selected_vocab_name = vocab_name[encoding_scheme] + + vocab = getattr(vocab_utils, selected_vocab_name)( + in_vocab_file_path=in_vocab_file_path, + event_data=None, + encoding_scheme=encoding_scheme, + num_features=num_features) + + if out_vocab_path is not None: + vocab.save_vocab(out_vocab_path) + + # Initialize symbolic dataset based on dataset name and configuration parameters + symbolic_dataset = getattr(data_utils, dataset_name)( + vocab=vocab, + encoding_scheme=encoding_scheme, + num_features=num_features, + debug=config.general.debug, + aug_type=config.data_params.aug_type, + input_length=config.train_params.input_length, + first_pred_feature=config.data_params.first_pred_feature, + caption_path=config.captions_path, + ) + + # Split dataset into training, validation, and test sets + split_ratio = config.data_params.split_ratio + trainset, validset, testset = symbolic_dataset.split_train_valid_test_set( + dataset_name=config.dataset, ratio=split_ratio, seed=42, save_dir=save_dir) + + # Create the Transformer model based on configuration parameters + nested_music_transformer = getattr(model_zoo, nn_params.model_name)( + vocab=symbolic_dataset.vocab, + input_length=config.train_params.input_length, + prediction_order=prediction_order, + input_embedder_name=nn_params.input_embedder_name, + main_decoder_name=nn_params.main_decoder_name, + sub_decoder_name=nn_params.sub_decoder_name, + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, + 
sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, + dim=nn_params.main_decoder.dim_model, + heads=nn_params.main_decoder.num_head, + depth=nn_params.main_decoder.num_layer, + dropout=nn_params.model_dropout, + ) + + # Log the total number of parameters requires grad in the model + total_params = sum(p.numel() for p in nested_music_transformer.parameters() if p.requires_grad) + print(f"Total number of parameters is: {total_params}") + + # # Optionally log the total parameters in Wandb + # if config.general.make_log: + # wandb.log({'model_total_params': total_params}, step=0) + + # Select loss function based on encoding scheme + # You can use focal loss by setting focal_alpha and focal_gamma in the config file + focal_alpha = config.train_params.focal_alpha + focal_gamma = config.train_params.focal_gamma + if encoding_scheme == 'remi': + loss_fn = NLLLoss4REMI(focal_alpha=focal_alpha, focal_gamma=focal_gamma) + elif encoding_scheme in ['cp', 'nb']: + if config.use_diff is False: + loss_fn = NLLLoss4CompoundToken(feature_list=symbolic_dataset.vocab.feature_list, focal_alpha=focal_alpha, focal_gamma=focal_gamma) + else: + loss_fn = DiffusionLoss4CompoundToken(feature_list=symbolic_dataset.vocab.feature_list, focal_alpha=focal_alpha, focal_gamma=focal_gamma) + + # Set optimizer and learning rate scheduler based on the configuration + optimizer = torch.optim.AdamW(nested_music_transformer.parameters(), lr=config.train_params.initial_lr, betas=(0.9, 0.95), eps=1e-08, weight_decay=0.01) + scheduler_dict = {'not-using': None, 'cosineannealingwarmuprestarts': CosineAnnealingWarmUpRestarts, 'cosinelr': CosineLRScheduler} + if scheduler_dict[config.train_params.scheduler] == CosineAnnealingWarmUpRestarts: + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, T_0=config.train_params.num_steps_per_cycle, T_mult=2, eta_min=0, eta_max=config.train_params.max_lr, T_up=config.train_params.warmup_steps, gamma=config.train_params.gamma) + elif scheduler_dict[config.train_params.scheduler] == CosineLRScheduler: + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, total_steps=config.train_params.num_iter * config.train_params.decay_step_rate, warmup_steps=config.train_params.warmup_steps, lr_min_ratio=0.1, cycle_length=1.0) + else: + scheduler = None + + # Define beat resolution and MIDI decoder based on the dataset and encoding scheme + in_beat_resolution_dict = {'BachChorale': 4, 'Pop1k7': 4, 'Pop909': 4, 'SOD': 12, 'LakhClean': 4, 'SymphonyMIDI': 8} + try: + in_beat_resolution = in_beat_resolution_dict[dataset_name] + except KeyError: + in_beat_resolution = 4 + midi_decoder_dict = {'remi':'MidiDecoder4REMI', 'cp':'MidiDecoder4CP', 'nb':'MidiDecoder4NB'} + midi_decoder = getattr(decoding_utils, midi_decoder_dict[encoding_scheme])(vocab=symbolic_dataset.vocab, in_beat_resolution=in_beat_resolution, dataset_name=dataset_name) + + # Select trainer class based on encoding scheme + trainer_option_dict = {'remi': 'LanguageModelTrainer4REMI', 'cp': 'LanguageModelTrainer4CompoundToken', 'nb':'LanguageModelTrainer4CompoundToken'} + trainer_option = trainer_option_dict[encoding_scheme] + sampling_method = None + sampling_threshold = 0.99 + sampling_temperature = 1.0 + + # Initialize and return the training module + training_module = getattr(trainer, trainer_option)( + model=nested_music_transformer, + optimizer=optimizer, + scheduler=scheduler, + 
loss_fn=loss_fn, + midi_decoder=midi_decoder, + train_set=trainset, + valid_set=validset, + save_dir=save_dir, + vocab=symbolic_dataset.vocab, + use_ddp=config.use_ddp, + use_fp16=config.use_fp16, + world_size=config.train_params.world_size, + batch_size=config.train_params.batch_size, + infer_target_len=symbolic_dataset.mean_len_tunes, + gpu_id=rank, + sampling_method=sampling_method, + sampling_threshold=sampling_threshold, + sampling_temperature=sampling_temperature, + config=config + ) + + return training_module + +# Prepare Encodec dataset and model for training +def prepare_encodec(config, save_dir, rank): + # Setup logging and determine where logs will be saved + save_dir = setup_log(config) + + # Extract neural network (NN) parameters and encoding scheme from config + nn_params = config.nn_params + encoding_scheme = config.data_params.encoding_scheme + + # no change in prediction order for encodec + prediction_order = ['k1', 'k2', 'k3', 'k4'] + + # Create directory for storing vocabulary files, if it doesn't already exist + vocab_dir = Path(f'vocab/vocab_MaestroEncodec') + Path(vocab_dir).mkdir(exist_ok=True, parents=True) + + # Define paths for input and output vocabulary files + in_vocab_file_path = vocab_dir / f'maestro-v3.0.0-in_vocab.json' + out_vocab_path = Path(save_dir) / f'maestro-v3.0.0-in_vocab.json' + + # Define path for tokenized dataset using the Encodec scheme + token_path = Path(f"dataset/encodec_dataset/maestro-v3.0.0-encodec_{config.data_type}") + + # Initialize the EncodecDataset object with necessary file paths and parameters + encodec_dataset = EncodecDataset( + in_vocab_file_path=in_vocab_file_path, + out_vocab_path=out_vocab_path, + encoding_scheme=encoding_scheme, + input_length=config.train_params.input_length, + token_path=token_path + ) + + # Split the dataset into training, validation, and test sets + trainset, validset, testset = encodec_dataset.split_train_valid_test_set() + + # Load the model from the model zoo based on the configuration and neural network parameters + nested_music_transformer = getattr(model_zoo, nn_params.model_name)( + vocab=encodec_dataset.vocab, # Vocab used by the dataset + input_length=config.train_params.input_length, # Length of input sequences + prediction_order=prediction_order, # Order in which predictions are made + input_embedder_name=nn_params.input_embedder_name, # Name of the embedding layer + main_decoder_name=nn_params.main_decoder_name, # Main decoder name + sub_decoder_name=nn_params.sub_decoder_name, # Sub-decoder name if applicable + sub_decoder_depth=nn_params.sub_decoder.num_layer if hasattr(nn_params, 'sub_decoder') else 0, # Sub-decoder depth if defined + sub_decoder_enricher_use=nn_params.sub_decoder.feature_enricher_use \ + if hasattr(nn_params, 'sub_decoder') and hasattr(nn_params.sub_decoder, 'feature_enricher_use') else False, # Use feature enricher in sub-decoder if defined + dim=nn_params.main_decoder.dim_model, # Model dimension + heads=nn_params.main_decoder.num_head, # Number of attention heads + depth=nn_params.main_decoder.num_layer, # Number of layers in the main decoder + dropout=nn_params.main_decoder.dropout, # Dropout rate + ) + + # Calculate and print the total number of model parameters + total_params = sum(p.numel() for p in nested_music_transformer.parameters()) + print(f"Total number of parameters is: {total_params}") + + # If logging is enabled, log the total parameter count to Weights and Biases + if config.general.make_log: + wandb.log({'model_total_params': total_params}) + + # 
Select the appropriate loss function based on the encoding scheme + # In discrete audio token, remi encoding means flatten encoding and nb encoding means compound encoding + # nb_delay encoding is the tokenization manipulation technique proposed in MusicGen "https://arxiv.org/abs/2306.05284" + if encoding_scheme == 'remi': + loss_fn = EncodecFlattenLoss(feature_list=encodec_dataset.vocab.feature_list) # Loss for REMI encoding scheme + elif encoding_scheme == 'nb' or encoding_scheme == 'nb_delay': + loss_fn = EncodecMultiClassLoss(feature_list=encodec_dataset.vocab.feature_list) # Loss for NB or NB-Delay encoding scheme + + # Initialize the AdamW optimizer with the model's parameters and specified hyperparameters + optimizer = torch.optim.AdamW(nested_music_transformer.parameters(), lr=config.train_params.initial_lr, betas=(0.9, 0.95), eps=1e-08, weight_decay=0.01) + + # Define scheduler options and initialize based on config + scheduler_dict = {'not-using': None, 'cosineannealingwarmuprestarts': CosineAnnealingWarmUpRestarts, 'cosinelr': CosineLRScheduler} + if scheduler_dict[config.train_params.scheduler] == CosineAnnealingWarmUpRestarts: + # Cosine Annealing with warm restarts + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, T_0=config.train_params.num_steps_per_cycle, T_mult=2, eta_min=0, eta_max=config.train_params.max_lr, T_up=config.train_params.warmup_steps, gamma=config.train_params.gamma) + elif scheduler_dict[config.train_params.scheduler] == CosineLRScheduler: + # Cosine LR Scheduler + scheduler = scheduler_dict[config.train_params.scheduler](optimizer, total_steps=config.train_params.num_iter * config.train_params.decay_step_rate, warmup_steps=config.train_params.warmup_steps, lr_min_ratio=0.1, cycle_length=1.0) + else: + scheduler = None # No scheduler if 'not-using' is selected + + # Define trainer options based on the encoding scheme + trainer_option_dict = {'remi': 'EncodecFlattenTrainer', 'nb':'EncodecMultiClassTrainer', 'nb_delay':'EncodecMultiClassTrainer'} + trainer_option = trainer_option_dict[encoding_scheme] + + # Define the target inference length for different encoding schemes + infer_target_len_dict = {'remi': 6000, 'nb': 1500, 'nb_delay': 1500} + infer_target_len = infer_target_len_dict[encoding_scheme] + + # sampling method and parameters + sampling_method = None + sampling_threshold = 1.0 + sampling_temperature = 1.0 + + # Initialize the appropriate trainer class with the model, optimizer, datasets, and other training parameters + training_module = getattr(trainer, trainer_option)( + model=nested_music_transformer, + optimizer=optimizer, + scheduler=scheduler, + loss_fn=loss_fn, + midi_decoder=None, + train_set=trainset, + valid_set=validset, + save_dir=save_dir, + vocab=encodec_dataset.vocab, + use_ddp=config.use_ddp, + use_fp16=config.use_fp16, + world_size=config.train_params.world_size, + batch_size=config.train_params.batch_size, + infer_target_len=infer_target_len, + gpu_id=rank, + sampling_method=sampling_method, + sampling_threshold=sampling_threshold, + sampling_temperature=sampling_temperature, + config=config + ) + + # Return the initialized training module to be used for training + return training_module + +def run_train_exp(rank, config, world_size:int=1): + # if config.use_ddp: ddp_setup(rank, world_size) + # config = copy.deepcopy(config) + # config.train_params.world_size = world_size + # if rank != 0: + # config.general.make_log = False + # config.general.infer_and_log = False + + save_dir = setup_log(config) + 
print(f"save_dir: {save_dir}") + if 'encodec' in config.dataset.lower(): + training_module = prepare_encodec(config, save_dir, rank) + else: + training_module = preapre_sybmolic(config, save_dir, rank) + training_module.accelerate_train_by_num_iter(int(config.train_params.num_iter)) + + if not 'encodec' in config.dataset.lower(): + try: + exp_code = [x for x in save_dir.split('/') if 'run-' in x][0] + mean_nll = run_evaluation(exp_code) + wandb.log({'evaluated_mean_nll': mean_nll}) + except Exception as e: + exp_code = "latest-run" + + +@hydra.main(version_base=None, config_path="./Amadeus/symbolic_yamls/", config_name="config-accelerate") +def main(config: DictConfig): + if config.use_ddp: + world_size = torch.cuda.device_count() + run_train_exp(0, config, world_size) + else: + run_train_exp(0, config) # single gpu + +if __name__ == "__main__": + main() +# CUDA_VISIBLE_DEVICES=2,3,4,5 accelerate launch --num_processes 4 --num_machines 1 train_accelerate.py \ No newline at end of file diff --git a/vocab/.DS_Store b/vocab/.DS_Store new file mode 100644 index 0000000..83dbc6a Binary files /dev/null and b/vocab/.DS_Store differ diff --git a/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json b/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_FinetuneDataset/vocab_FinetuneDataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + 
"27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + 
"55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": 
"Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + 
"velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json b/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_LakhALL/vocab_LakhALL_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + 
"73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": 
"Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + 
"103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json b/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_LakhALLFined/vocab_LakhALLFined_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + 
"29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + 
"57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + 
"pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": 
"Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json b/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json new file mode 100644 index 0000000..9534353 --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_nb5.json @@ -0,0 +1,278 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": 
"Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + } +} \ No newline at end of file diff 
--git a/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json b/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json new file mode 100644 index 0000000..68b07cc --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_nb8.json @@ -0,0 +1,488 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + 
"83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_54", + "18": "Tempo_56", + "19": "Tempo_58", + "20": "Tempo_60", + "21": "Tempo_62", + "22": "Tempo_64", + "23": "Tempo_67", + "24": "Tempo_70", + "25": "Tempo_73", + "26": "Tempo_76", + "27": "Tempo_79", + "28": "Tempo_82", + "29": "Tempo_85", + "30": "Tempo_88", + "31": "Tempo_92", + "32": "Tempo_96", + "33": "Tempo_100", + "34": "Tempo_104", + "35": "Tempo_108", + "36": "Tempo_112", + "37": "Tempo_116", + "38": "Tempo_121", + "39": "Tempo_126", + "40": "Tempo_131", + "41": "Tempo_136", + "42": "Tempo_141", + "43": "Tempo_147", + "44": "Tempo_153", + "45": "Tempo_159", + "46": "Tempo_165", + "47": "Tempo_172", + "48": "Tempo_179", + "49": "Tempo_186", + "50": "Tempo_193", + "51": "Tempo_201", + "52": "Tempo_209", + "53": "Tempo_217", + "54": "Tempo_226", + "55": "Tempo_235", + "56": "Tempo_244", + "57": "Tempo_254", + "58": "Tempo_264", + "59": "Tempo_275", + "60": "Tempo_286", + "61": "Tempo_297", + "62": "Tempo_309", + "63": "Tempo_347", + "64": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": 
"Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + 
"115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json b/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json new file mode 100644 index 0000000..2c51382 --- /dev/null +++ b/vocab/vocab_LakhClean/vocab_LakhClean_remi8.json @@ -0,0 +1,462 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/2", + "4": "Bar_time_signature_1/4", + "5": "Bar_time_signature_1/8", + "6": "Bar_time_signature_11/8", + "7": "Bar_time_signature_12/8", + "8": "Bar_time_signature_2/2", + "9": "Bar_time_signature_2/4", + "10": "Bar_time_signature_2/8", + "11": "Bar_time_signature_3/2", + "12": "Bar_time_signature_3/4", + "13": "Bar_time_signature_3/8", + "14": "Bar_time_signature_4/2", + "15": "Bar_time_signature_4/4", + "16": "Bar_time_signature_4/8", + "17": "Bar_time_signature_5/4", + "18": "Bar_time_signature_5/8", + "19": "Bar_time_signature_6/4", + "20": "Bar_time_signature_6/8", + "21": "Bar_time_signature_7/4", + "22": "Bar_time_signature_7/8", + "23": "Bar_time_signature_8/4", + "24": "Bar_time_signature_8/8", + "25": "Bar_time_signature_9/8", + "26": "Beat_0", + "27": "Beat_1", + "28": "Beat_2", + "29": "Beat_3", + "30": "Beat_4", + "31": "Beat_5", + "32": "Beat_6", + "33": "Beat_7", + "34": "Beat_8", + "35": "Beat_9", + "36": "Beat_10", + "37": "Beat_11", + "38": "Beat_12", + "39": "Beat_13", + "40": "Beat_14", + "41": "Beat_15", + "42": "Beat_16", + "43": "Beat_17", + "44": "Beat_18", + "45": "Beat_19", + "46": "Beat_20", + "47": "Beat_21", + "48": "Beat_22", + "49": "Beat_23", + "50": "Beat_24", + "51": "Beat_25", + "52": "Beat_26", + "53": "Beat_27", + "54": "Beat_28", + "55": "Beat_29", + "56": "Beat_30", + "57": "Beat_31", + "58": "Note_Duration_1", + "59": "Note_Duration_10", + "60": "Note_Duration_12", + "61": "Note_Duration_16", + "62": "Note_Duration_2", + "63": "Note_Duration_20", + "64": "Note_Duration_24", + "65": "Note_Duration_28", + "66": "Note_Duration_3", + "67": "Note_Duration_32", + "68": "Note_Duration_4", + "69": "Note_Duration_5", + "70": "Note_Duration_6", + "71": "Note_Duration_8", + "72": "Note_Velocity_100", + "73": "Note_Velocity_120", + "74": "Note_Velocity_40", + "75": "Note_Velocity_60", + "76": "Note_Velocity_80", + "77": "Tempo_100", + "78": "Tempo_104", + "79": "Tempo_108", + "80": "Tempo_112", + "81": "Tempo_116", + "82": "Tempo_121", + "83": "Tempo_126", + "84": "Tempo_131", + "85": "Tempo_136", + "86": "Tempo_141", + "87": "Tempo_147", + "88": "Tempo_153", + "89": "Tempo_159", + "90": "Tempo_165", + "91": "Tempo_172", + "92": "Tempo_179", + "93": "Tempo_186", + "94": "Tempo_193", + "95": "Tempo_201", + "96": "Tempo_209", + "97": "Tempo_217", + "98": "Tempo_226", + "99": "Tempo_235", + "100": "Tempo_244", + "101": "Tempo_254", + "102": "Tempo_264", + 
"103": "Tempo_275", + "104": "Tempo_286", + "105": "Tempo_297", + "106": "Tempo_30", + "107": "Tempo_309", + "108": "Tempo_31", + "109": "Tempo_32", + "110": "Tempo_33", + "111": "Tempo_34", + "112": "Tempo_347", + "113": "Tempo_35", + "114": "Tempo_36", + "115": "Tempo_37", + "116": "Tempo_38", + "117": "Tempo_390", + "118": "Tempo_40", + "119": "Tempo_42", + "120": "Tempo_44", + "121": "Tempo_46", + "122": "Tempo_48", + "123": "Tempo_50", + "124": "Tempo_52", + "125": "Tempo_54", + "126": "Tempo_56", + "127": "Tempo_58", + "128": "Tempo_60", + "129": "Tempo_62", + "130": "Tempo_64", + "131": "Tempo_67", + "132": "Tempo_70", + "133": "Tempo_73", + "134": "Tempo_76", + "135": "Tempo_79", + "136": "Tempo_82", + "137": "Tempo_85", + "138": "Tempo_88", + "139": "Tempo_92", + "140": "Tempo_96", + "141": "Note_Pitch_6", + "142": "Note_Pitch_7", + "143": "Note_Pitch_8", + "144": "Note_Pitch_9", + "145": "Note_Pitch_10", + "146": "Note_Pitch_11", + "147": "Note_Pitch_12", + "148": "Note_Pitch_13", + "149": "Note_Pitch_14", + "150": "Note_Pitch_15", + "151": "Note_Pitch_16", + "152": "Note_Pitch_17", + "153": "Note_Pitch_18", + "154": "Note_Pitch_19", + "155": "Note_Pitch_20", + "156": "Note_Pitch_21", + "157": "Note_Pitch_22", + "158": "Note_Pitch_23", + "159": "Note_Pitch_24", + "160": "Note_Pitch_25", + "161": "Note_Pitch_26", + "162": "Note_Pitch_27", + "163": "Note_Pitch_28", + "164": "Note_Pitch_29", + "165": "Note_Pitch_30", + "166": "Note_Pitch_31", + "167": "Note_Pitch_32", + "168": "Note_Pitch_33", + "169": "Note_Pitch_34", + "170": "Note_Pitch_35", + "171": "Note_Pitch_36", + "172": "Note_Pitch_37", + "173": "Note_Pitch_38", + "174": "Note_Pitch_39", + "175": "Note_Pitch_40", + "176": "Note_Pitch_41", + "177": "Note_Pitch_42", + "178": "Note_Pitch_43", + "179": "Note_Pitch_44", + "180": "Note_Pitch_45", + "181": "Note_Pitch_46", + "182": "Note_Pitch_47", + "183": "Note_Pitch_48", + "184": "Note_Pitch_49", + "185": "Note_Pitch_50", + "186": "Note_Pitch_51", + "187": "Note_Pitch_52", + "188": "Note_Pitch_53", + "189": "Note_Pitch_54", + "190": "Note_Pitch_55", + "191": "Note_Pitch_56", + "192": "Note_Pitch_57", + "193": "Note_Pitch_58", + "194": "Note_Pitch_59", + "195": "Note_Pitch_60", + "196": "Note_Pitch_61", + "197": "Note_Pitch_62", + "198": "Note_Pitch_63", + "199": "Note_Pitch_64", + "200": "Note_Pitch_65", + "201": "Note_Pitch_66", + "202": "Note_Pitch_67", + "203": "Note_Pitch_68", + "204": "Note_Pitch_69", + "205": "Note_Pitch_70", + "206": "Note_Pitch_71", + "207": "Note_Pitch_72", + "208": "Note_Pitch_73", + "209": "Note_Pitch_74", + "210": "Note_Pitch_75", + "211": "Note_Pitch_76", + "212": "Note_Pitch_77", + "213": "Note_Pitch_78", + "214": "Note_Pitch_79", + "215": "Note_Pitch_80", + "216": "Note_Pitch_81", + "217": "Note_Pitch_82", + "218": "Note_Pitch_83", + "219": "Note_Pitch_84", + "220": "Note_Pitch_85", + "221": "Note_Pitch_86", + "222": "Note_Pitch_87", + "223": "Note_Pitch_88", + "224": "Note_Pitch_89", + "225": "Note_Pitch_90", + "226": "Note_Pitch_91", + "227": "Note_Pitch_92", + "228": "Note_Pitch_93", + "229": "Note_Pitch_94", + "230": "Note_Pitch_95", + "231": "Note_Pitch_96", + "232": "Note_Pitch_97", + "233": "Note_Pitch_98", + "234": "Note_Pitch_99", + "235": "Note_Pitch_100", + "236": "Note_Pitch_101", + "237": "Note_Pitch_102", + "238": "Note_Pitch_103", + "239": "Note_Pitch_104", + "240": "Note_Pitch_105", + "241": "Note_Pitch_106", + "242": "Note_Pitch_107", + "243": "Note_Pitch_108", + "244": "Note_Pitch_109", + "245": "Note_Pitch_110", + "246": 
"Note_Pitch_111", + "247": "Note_Pitch_112", + "248": "Note_Pitch_113", + "249": "Note_Pitch_114", + "250": "Note_Pitch_115", + "251": "Note_Pitch_116", + "252": "Note_Pitch_117", + "253": "Note_Pitch_118", + "254": "Note_Pitch_119", + "255": "Note_Pitch_120", + "256": "Note_Pitch_121", + "257": "Note_Pitch_122", + "258": "Note_Pitch_123", + "259": "Note_Pitch_124", + "260": "Note_Pitch_125", + "261": "Note_Pitch_126", + "262": "Instrument_0", + "263": "Instrument_4", + "264": "Instrument_6", + "265": "Instrument_7", + "266": "Instrument_8", + "267": "Instrument_9", + "268": "Instrument_10", + "269": "Instrument_11", + "270": "Instrument_12", + "271": "Instrument_13", + "272": "Instrument_14", + "273": "Instrument_15", + "274": "Instrument_16", + "275": "Instrument_19", + "276": "Instrument_21", + "277": "Instrument_22", + "278": "Instrument_23", + "279": "Instrument_24", + "280": "Instrument_25", + "281": "Instrument_26", + "282": "Instrument_32", + "283": "Instrument_33", + "284": "Instrument_36", + "285": "Instrument_38", + "286": "Instrument_40", + "287": "Instrument_41", + "288": "Instrument_42", + "289": "Instrument_43", + "290": "Instrument_46", + "291": "Instrument_47", + "292": "Instrument_49", + "293": "Instrument_50", + "294": "Instrument_52", + "295": "Instrument_55", + "296": "Instrument_56", + "297": "Instrument_57", + "298": "Instrument_58", + "299": "Instrument_60", + "300": "Instrument_61", + "301": "Instrument_62", + "302": "Instrument_64", + "303": "Instrument_65", + "304": "Instrument_66", + "305": "Instrument_67", + "306": "Instrument_68", + "307": "Instrument_69", + "308": "Instrument_70", + "309": "Instrument_71", + "310": "Instrument_72", + "311": "Instrument_73", + "312": "Instrument_74", + "313": "Instrument_75", + "314": "Instrument_79", + "315": "Instrument_80", + "316": "Instrument_88", + "317": "Instrument_104", + "318": "Instrument_105", + "319": "Instrument_106", + "320": "Instrument_107", + "321": "Instrument_108", + "322": "Instrument_109", + "323": "Instrument_111", + "324": "Instrument_114", + "325": "Instrument_117", + "326": "Instrument_118", + "327": "Chord_A_+", + "328": "Chord_A#_+", + "329": "Chord_B_+", + "330": "Chord_C_+", + "331": "Chord_C#_+", + "332": "Chord_D_+", + "333": "Chord_D#_+", + "334": "Chord_E_+", + "335": "Chord_F_+", + "336": "Chord_F#_+", + "337": "Chord_G_+", + "338": "Chord_G#_+", + "339": "Chord_A_/o7", + "340": "Chord_A#_/o7", + "341": "Chord_B_/o7", + "342": "Chord_C_/o7", + "343": "Chord_C#_/o7", + "344": "Chord_D_/o7", + "345": "Chord_D#_/o7", + "346": "Chord_E_/o7", + "347": "Chord_F_/o7", + "348": "Chord_F#_/o7", + "349": "Chord_G_/o7", + "350": "Chord_G#_/o7", + "351": "Chord_A_7", + "352": "Chord_A#_7", + "353": "Chord_B_7", + "354": "Chord_C_7", + "355": "Chord_C#_7", + "356": "Chord_D_7", + "357": "Chord_D#_7", + "358": "Chord_E_7", + "359": "Chord_F_7", + "360": "Chord_F#_7", + "361": "Chord_G_7", + "362": "Chord_G#_7", + "363": "Chord_A_M", + "364": "Chord_A#_M", + "365": "Chord_B_M", + "366": "Chord_C_M", + "367": "Chord_C#_M", + "368": "Chord_D_M", + "369": "Chord_D#_M", + "370": "Chord_E_M", + "371": "Chord_F_M", + "372": "Chord_F#_M", + "373": "Chord_G_M", + "374": "Chord_G#_M", + "375": "Chord_A_M7", + "376": "Chord_A#_M7", + "377": "Chord_B_M7", + "378": "Chord_C_M7", + "379": "Chord_C#_M7", + "380": "Chord_D_M7", + "381": "Chord_D#_M7", + "382": "Chord_E_M7", + "383": "Chord_F_M7", + "384": "Chord_F#_M7", + "385": "Chord_G_M7", + "386": "Chord_G#_M7", + "387": "Chord_A_m", + "388": "Chord_A#_m", + "389": 
"Chord_B_m", + "390": "Chord_C_m", + "391": "Chord_C#_m", + "392": "Chord_D_m", + "393": "Chord_D#_m", + "394": "Chord_E_m", + "395": "Chord_F_m", + "396": "Chord_F#_m", + "397": "Chord_G_m", + "398": "Chord_G#_m", + "399": "Chord_A_m7", + "400": "Chord_A#_m7", + "401": "Chord_B_m7", + "402": "Chord_C_m7", + "403": "Chord_C#_m7", + "404": "Chord_D_m7", + "405": "Chord_D#_m7", + "406": "Chord_E_m7", + "407": "Chord_F_m7", + "408": "Chord_F#_m7", + "409": "Chord_G_m7", + "410": "Chord_G#_m7", + "411": "Chord_A_o", + "412": "Chord_A#_o", + "413": "Chord_B_o", + "414": "Chord_C_o", + "415": "Chord_C#_o", + "416": "Chord_D_o", + "417": "Chord_D#_o", + "418": "Chord_E_o", + "419": "Chord_F_o", + "420": "Chord_F#_o", + "421": "Chord_G_o", + "422": "Chord_G#_o", + "423": "Chord_A_o7", + "424": "Chord_A#_o7", + "425": "Chord_B_o7", + "426": "Chord_C_o7", + "427": "Chord_C#_o7", + "428": "Chord_D_o7", + "429": "Chord_D#_o7", + "430": "Chord_E_o7", + "431": "Chord_F_o7", + "432": "Chord_F#_o7", + "433": "Chord_G_o7", + "434": "Chord_G#_o7", + "435": "Chord_A_sus2", + "436": "Chord_A#_sus2", + "437": "Chord_B_sus2", + "438": "Chord_C_sus2", + "439": "Chord_C#_sus2", + "440": "Chord_D_sus2", + "441": "Chord_D#_sus2", + "442": "Chord_E_sus2", + "443": "Chord_F_sus2", + "444": "Chord_F#_sus2", + "445": "Chord_G_sus2", + "446": "Chord_G#_sus2", + "447": "Chord_A_sus4", + "448": "Chord_A#_sus4", + "449": "Chord_B_sus4", + "450": "Chord_C_sus4", + "451": "Chord_C#_sus4", + "452": "Chord_D_sus4", + "453": "Chord_D#_sus4", + "454": "Chord_E_sus4", + "455": "Chord_F_sus4", + "456": "Chord_F#_sus4", + "457": "Chord_G_sus4", + "458": "Chord_G#_sus4", + "459": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json b/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json new file mode 100644 index 0000000..6b94992 --- /dev/null +++ b/vocab/vocab_Pop1k7/vocab_Pop1k7_nb8.json @@ -0,0 +1,354 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": 
"Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_50", + "15": "Tempo_52", + "16": "Tempo_54", + "17": "Tempo_56", + "18": "Tempo_58", + "19": "Tempo_60", + "20": "Tempo_62", + "21": "Tempo_64", + "22": "Tempo_67", + "23": "Tempo_70", + "24": "Tempo_73", + "25": "Tempo_76", + "26": "Tempo_79", + "27": "Tempo_82", + "28": "Tempo_85", + "29": "Tempo_88", + "30": "Tempo_92", + "31": "Tempo_96", + "32": "Tempo_100", + "33": "Tempo_104", + "34": "Tempo_108", + "35": "Tempo_112", + "36": "Tempo_116", + "37": "Tempo_121", + "38": "Tempo_126", + "39": "Tempo_131", + "40": "Tempo_136", + "41": "Tempo_141", + "42": "Tempo_147", + "43": "Tempo_153", + "44": "Tempo_159", + "45": "Tempo_165", + "46": "Tempo_172", + "47": "Tempo_179", + "48": "Tempo_186", + "49": "Tempo_193", + "50": "Tempo_201", + "51": "Tempo_209", + "52": "Tempo_217", + "53": "Tempo_226", + "54": "Tempo_235" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_16", + "2": "Note_Pitch_17", + "3": "Note_Pitch_18", + "4": "Note_Pitch_19", + "5": "Note_Pitch_20", + "6": "Note_Pitch_21", + "7": "Note_Pitch_22", + "8": "Note_Pitch_23", + "9": "Note_Pitch_24", + "10": "Note_Pitch_25", + "11": "Note_Pitch_26", + "12": "Note_Pitch_27", + "13": "Note_Pitch_28", + "14": "Note_Pitch_29", + "15": "Note_Pitch_30", + "16": "Note_Pitch_31", + "17": "Note_Pitch_32", + "18": "Note_Pitch_33", + "19": "Note_Pitch_34", + "20": 
"Note_Pitch_35", + "21": "Note_Pitch_36", + "22": "Note_Pitch_37", + "23": "Note_Pitch_38", + "24": "Note_Pitch_39", + "25": "Note_Pitch_40", + "26": "Note_Pitch_41", + "27": "Note_Pitch_42", + "28": "Note_Pitch_43", + "29": "Note_Pitch_44", + "30": "Note_Pitch_45", + "31": "Note_Pitch_46", + "32": "Note_Pitch_47", + "33": "Note_Pitch_48", + "34": "Note_Pitch_49", + "35": "Note_Pitch_50", + "36": "Note_Pitch_51", + "37": "Note_Pitch_52", + "38": "Note_Pitch_53", + "39": "Note_Pitch_54", + "40": "Note_Pitch_55", + "41": "Note_Pitch_56", + "42": "Note_Pitch_57", + "43": "Note_Pitch_58", + "44": "Note_Pitch_59", + "45": "Note_Pitch_60", + "46": "Note_Pitch_61", + "47": "Note_Pitch_62", + "48": "Note_Pitch_63", + "49": "Note_Pitch_64", + "50": "Note_Pitch_65", + "51": "Note_Pitch_66", + "52": "Note_Pitch_67", + "53": "Note_Pitch_68", + "54": "Note_Pitch_69", + "55": "Note_Pitch_70", + "56": "Note_Pitch_71", + "57": "Note_Pitch_72", + "58": "Note_Pitch_73", + "59": "Note_Pitch_74", + "60": "Note_Pitch_75", + "61": "Note_Pitch_76", + "62": "Note_Pitch_77", + "63": "Note_Pitch_78", + "64": "Note_Pitch_79", + "65": "Note_Pitch_80", + "66": "Note_Pitch_81", + "67": "Note_Pitch_82", + "68": "Note_Pitch_83", + "69": "Note_Pitch_84", + "70": "Note_Pitch_85", + "71": "Note_Pitch_86", + "72": "Note_Pitch_87", + "73": "Note_Pitch_88", + "74": "Note_Pitch_89", + "75": "Note_Pitch_90", + "76": "Note_Pitch_91", + "77": "Note_Pitch_92", + "78": "Note_Pitch_93", + "79": "Note_Pitch_94", + "80": "Note_Pitch_95", + "81": "Note_Pitch_96", + "82": "Note_Pitch_97", + "83": "Note_Pitch_98", + "84": "Note_Pitch_99", + "85": "Note_Pitch_100", + "86": "Note_Pitch_101", + "87": "Note_Pitch_102", + "88": "Note_Pitch_103", + "89": "Note_Pitch_104", + "90": "Note_Pitch_105", + "91": "Note_Pitch_106", + "92": "Note_Pitch_107", + "93": "Note_Pitch_108", + "94": "Note_Pitch_109", + "95": "Note_Pitch_110", + "96": "Note_Pitch_111", + "97": "Note_Pitch_112" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_48", + "3": "Note_Velocity_56", + "4": "Note_Velocity_64", + "5": "Note_Velocity_72", + "6": "Note_Velocity_80", + "7": "Note_Velocity_88" + } +} \ No newline at end of file diff --git a/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json b/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json new file mode 100644 index 0000000..89af462 --- /dev/null +++ b/vocab/vocab_Pop1k7/vocab_Pop1k7_remi8.json @@ -0,0 +1,328 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_4/4", + "4": "Beat_0", + "5": "Beat_1", + "6": "Beat_2", + "7": "Beat_3", + "8": "Beat_4", + "9": "Beat_5", + "10": "Beat_6", + "11": "Beat_7", + "12": "Beat_8", + "13": "Beat_9", + "14": "Beat_10", + "15": "Beat_11", + "16": "Beat_12", + "17": "Beat_13", + "18": "Beat_14", + "19": "Beat_15", + "20": "Note_Duration_1", + "21": "Note_Duration_10", + "22": "Note_Duration_12", + "23": "Note_Duration_16", + "24": "Note_Duration_2", + "25": "Note_Duration_20", + "26": "Note_Duration_24", + "27": "Note_Duration_28", + "28": "Note_Duration_3", + "29": "Note_Duration_32", + "30": "Note_Duration_4", + "31": "Note_Duration_5", + 
"32": "Note_Duration_6", + "33": "Note_Duration_8", + "34": "Note_Velocity_40", + "35": "Note_Velocity_48", + "36": "Note_Velocity_56", + "37": "Note_Velocity_64", + "38": "Note_Velocity_72", + "39": "Note_Velocity_80", + "40": "Note_Velocity_88", + "41": "Tempo_100", + "42": "Tempo_104", + "43": "Tempo_108", + "44": "Tempo_112", + "45": "Tempo_116", + "46": "Tempo_121", + "47": "Tempo_126", + "48": "Tempo_131", + "49": "Tempo_136", + "50": "Tempo_141", + "51": "Tempo_147", + "52": "Tempo_153", + "53": "Tempo_159", + "54": "Tempo_165", + "55": "Tempo_172", + "56": "Tempo_179", + "57": "Tempo_186", + "58": "Tempo_193", + "59": "Tempo_201", + "60": "Tempo_209", + "61": "Tempo_217", + "62": "Tempo_226", + "63": "Tempo_235", + "64": "Tempo_30", + "65": "Tempo_31", + "66": "Tempo_32", + "67": "Tempo_33", + "68": "Tempo_34", + "69": "Tempo_35", + "70": "Tempo_36", + "71": "Tempo_37", + "72": "Tempo_38", + "73": "Tempo_40", + "74": "Tempo_42", + "75": "Tempo_44", + "76": "Tempo_46", + "77": "Tempo_50", + "78": "Tempo_52", + "79": "Tempo_54", + "80": "Tempo_56", + "81": "Tempo_58", + "82": "Tempo_60", + "83": "Tempo_62", + "84": "Tempo_64", + "85": "Tempo_67", + "86": "Tempo_70", + "87": "Tempo_73", + "88": "Tempo_76", + "89": "Tempo_79", + "90": "Tempo_82", + "91": "Tempo_85", + "92": "Tempo_88", + "93": "Tempo_92", + "94": "Tempo_96", + "95": "Note_Pitch_16", + "96": "Note_Pitch_17", + "97": "Note_Pitch_18", + "98": "Note_Pitch_19", + "99": "Note_Pitch_20", + "100": "Note_Pitch_21", + "101": "Note_Pitch_22", + "102": "Note_Pitch_23", + "103": "Note_Pitch_24", + "104": "Note_Pitch_25", + "105": "Note_Pitch_26", + "106": "Note_Pitch_27", + "107": "Note_Pitch_28", + "108": "Note_Pitch_29", + "109": "Note_Pitch_30", + "110": "Note_Pitch_31", + "111": "Note_Pitch_32", + "112": "Note_Pitch_33", + "113": "Note_Pitch_34", + "114": "Note_Pitch_35", + "115": "Note_Pitch_36", + "116": "Note_Pitch_37", + "117": "Note_Pitch_38", + "118": "Note_Pitch_39", + "119": "Note_Pitch_40", + "120": "Note_Pitch_41", + "121": "Note_Pitch_42", + "122": "Note_Pitch_43", + "123": "Note_Pitch_44", + "124": "Note_Pitch_45", + "125": "Note_Pitch_46", + "126": "Note_Pitch_47", + "127": "Note_Pitch_48", + "128": "Note_Pitch_49", + "129": "Note_Pitch_50", + "130": "Note_Pitch_51", + "131": "Note_Pitch_52", + "132": "Note_Pitch_53", + "133": "Note_Pitch_54", + "134": "Note_Pitch_55", + "135": "Note_Pitch_56", + "136": "Note_Pitch_57", + "137": "Note_Pitch_58", + "138": "Note_Pitch_59", + "139": "Note_Pitch_60", + "140": "Note_Pitch_61", + "141": "Note_Pitch_62", + "142": "Note_Pitch_63", + "143": "Note_Pitch_64", + "144": "Note_Pitch_65", + "145": "Note_Pitch_66", + "146": "Note_Pitch_67", + "147": "Note_Pitch_68", + "148": "Note_Pitch_69", + "149": "Note_Pitch_70", + "150": "Note_Pitch_71", + "151": "Note_Pitch_72", + "152": "Note_Pitch_73", + "153": "Note_Pitch_74", + "154": "Note_Pitch_75", + "155": "Note_Pitch_76", + "156": "Note_Pitch_77", + "157": "Note_Pitch_78", + "158": "Note_Pitch_79", + "159": "Note_Pitch_80", + "160": "Note_Pitch_81", + "161": "Note_Pitch_82", + "162": "Note_Pitch_83", + "163": "Note_Pitch_84", + "164": "Note_Pitch_85", + "165": "Note_Pitch_86", + "166": "Note_Pitch_87", + "167": "Note_Pitch_88", + "168": "Note_Pitch_89", + "169": "Note_Pitch_90", + "170": "Note_Pitch_91", + "171": "Note_Pitch_92", + "172": "Note_Pitch_93", + "173": "Note_Pitch_94", + "174": "Note_Pitch_95", + "175": "Note_Pitch_96", + "176": "Note_Pitch_97", + "177": "Note_Pitch_98", + "178": "Note_Pitch_99", + "179": 
"Note_Pitch_100", + "180": "Note_Pitch_101", + "181": "Note_Pitch_102", + "182": "Note_Pitch_103", + "183": "Note_Pitch_104", + "184": "Note_Pitch_105", + "185": "Note_Pitch_106", + "186": "Note_Pitch_107", + "187": "Note_Pitch_108", + "188": "Note_Pitch_109", + "189": "Note_Pitch_110", + "190": "Note_Pitch_111", + "191": "Note_Pitch_112", + "192": "Instrument_0", + "193": "Chord_A_+", + "194": "Chord_A#_+", + "195": "Chord_B_+", + "196": "Chord_C_+", + "197": "Chord_C#_+", + "198": "Chord_D_+", + "199": "Chord_D#_+", + "200": "Chord_E_+", + "201": "Chord_F_+", + "202": "Chord_F#_+", + "203": "Chord_G_+", + "204": "Chord_G#_+", + "205": "Chord_A_/o7", + "206": "Chord_A#_/o7", + "207": "Chord_B_/o7", + "208": "Chord_C_/o7", + "209": "Chord_C#_/o7", + "210": "Chord_D_/o7", + "211": "Chord_D#_/o7", + "212": "Chord_E_/o7", + "213": "Chord_F_/o7", + "214": "Chord_F#_/o7", + "215": "Chord_G_/o7", + "216": "Chord_G#_/o7", + "217": "Chord_A_7", + "218": "Chord_A#_7", + "219": "Chord_B_7", + "220": "Chord_C_7", + "221": "Chord_C#_7", + "222": "Chord_D_7", + "223": "Chord_D#_7", + "224": "Chord_E_7", + "225": "Chord_F_7", + "226": "Chord_F#_7", + "227": "Chord_G_7", + "228": "Chord_G#_7", + "229": "Chord_A_M", + "230": "Chord_A#_M", + "231": "Chord_B_M", + "232": "Chord_C_M", + "233": "Chord_C#_M", + "234": "Chord_D_M", + "235": "Chord_D#_M", + "236": "Chord_E_M", + "237": "Chord_F_M", + "238": "Chord_F#_M", + "239": "Chord_G_M", + "240": "Chord_G#_M", + "241": "Chord_A_M7", + "242": "Chord_A#_M7", + "243": "Chord_B_M7", + "244": "Chord_C_M7", + "245": "Chord_C#_M7", + "246": "Chord_D_M7", + "247": "Chord_D#_M7", + "248": "Chord_E_M7", + "249": "Chord_F_M7", + "250": "Chord_F#_M7", + "251": "Chord_G_M7", + "252": "Chord_G#_M7", + "253": "Chord_A_m", + "254": "Chord_A#_m", + "255": "Chord_B_m", + "256": "Chord_C_m", + "257": "Chord_C#_m", + "258": "Chord_D_m", + "259": "Chord_D#_m", + "260": "Chord_E_m", + "261": "Chord_F_m", + "262": "Chord_F#_m", + "263": "Chord_G_m", + "264": "Chord_G#_m", + "265": "Chord_A_m7", + "266": "Chord_A#_m7", + "267": "Chord_B_m7", + "268": "Chord_C_m7", + "269": "Chord_C#_m7", + "270": "Chord_D_m7", + "271": "Chord_D#_m7", + "272": "Chord_E_m7", + "273": "Chord_F_m7", + "274": "Chord_F#_m7", + "275": "Chord_G_m7", + "276": "Chord_G#_m7", + "277": "Chord_A_o", + "278": "Chord_A#_o", + "279": "Chord_B_o", + "280": "Chord_C_o", + "281": "Chord_C#_o", + "282": "Chord_D_o", + "283": "Chord_D#_o", + "284": "Chord_E_o", + "285": "Chord_F_o", + "286": "Chord_F#_o", + "287": "Chord_G_o", + "288": "Chord_G#_o", + "289": "Chord_A_o7", + "290": "Chord_A#_o7", + "291": "Chord_B_o7", + "292": "Chord_C_o7", + "293": "Chord_C#_o7", + "294": "Chord_D_o7", + "295": "Chord_D#_o7", + "296": "Chord_E_o7", + "297": "Chord_F_o7", + "298": "Chord_F#_o7", + "299": "Chord_G_o7", + "300": "Chord_G#_o7", + "301": "Chord_A_sus2", + "302": "Chord_A#_sus2", + "303": "Chord_B_sus2", + "304": "Chord_C_sus2", + "305": "Chord_C#_sus2", + "306": "Chord_D_sus2", + "307": "Chord_D#_sus2", + "308": "Chord_E_sus2", + "309": "Chord_F_sus2", + "310": "Chord_F#_sus2", + "311": "Chord_G_sus2", + "312": "Chord_G#_sus2", + "313": "Chord_A_sus4", + "314": "Chord_A#_sus4", + "315": "Chord_B_sus4", + "316": "Chord_C_sus4", + "317": "Chord_C#_sus4", + "318": "Chord_D_sus4", + "319": "Chord_D#_sus4", + "320": "Chord_E_sus4", + "321": "Chord_F_sus4", + "322": "Chord_F#_sus4", + "323": "Chord_G_sus4", + "324": "Chord_G#_sus4", + "325": "Chord_N_N" +} \ No newline at end of file diff --git 
a/vocab/vocab_Pop909/vocab_Pop909_nb8.json b/vocab/vocab_Pop909/vocab_Pop909_nb8.json new file mode 100644 index 0000000..f080385 --- /dev/null +++ b/vocab/vocab_Pop909/vocab_Pop909_nb8.json @@ -0,0 +1,350 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_6/8", + "7": "NNN_time_signature_1/8", + "8": "NNN_time_signature_4/4", + "9": "NNN_time_signature_3/4", + "10": "NNN_time_signature_2/4", + "11": "NNN_time_signature_1/4", + "12": "NNN_time_signature_2/2" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": 
"Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_34", + "4": "Tempo_36", + "5": "Tempo_37", + "6": "Tempo_38", + "7": "Tempo_40", + "8": "Tempo_42", + "9": "Tempo_44", + "10": "Tempo_46", + "11": "Tempo_48", + "12": "Tempo_50", + "13": "Tempo_52", + "14": "Tempo_54", + "15": "Tempo_56", + "16": "Tempo_58", + "17": "Tempo_60", + "18": "Tempo_62", + "19": "Tempo_64", + "20": "Tempo_67", + "21": "Tempo_70", + "22": "Tempo_73", + "23": "Tempo_76", + "24": "Tempo_79", + "25": "Tempo_82", + "26": "Tempo_85", + "27": "Tempo_88", + "28": "Tempo_92", + "29": "Tempo_96", + "30": "Tempo_100", + "31": "Tempo_104", + "32": "Tempo_108", + "33": "Tempo_112", + "34": "Tempo_116", + "35": "Tempo_121", + "36": "Tempo_126", + "37": "Tempo_131", + "38": "Tempo_136", + "39": "Tempo_141", + "40": "Tempo_147", + "41": "Tempo_159", + "42": "Tempo_172", + "43": "Tempo_179", + "44": "Tempo_193" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_17", + "2": "Note_Pitch_18", + "3": "Note_Pitch_19", + "4": "Note_Pitch_20", + "5": "Note_Pitch_21", + "6": "Note_Pitch_22", + "7": "Note_Pitch_23", + "8": "Note_Pitch_24", + "9": "Note_Pitch_25", + "10": "Note_Pitch_26", + "11": "Note_Pitch_27", + "12": "Note_Pitch_28", + "13": "Note_Pitch_29", + "14": "Note_Pitch_30", + "15": "Note_Pitch_31", + "16": "Note_Pitch_32", + "17": "Note_Pitch_33", + "18": "Note_Pitch_34", + "19": "Note_Pitch_35", + "20": "Note_Pitch_36", + "21": "Note_Pitch_37", + "22": "Note_Pitch_38", + "23": "Note_Pitch_39", + "24": "Note_Pitch_40", + "25": "Note_Pitch_41", + "26": "Note_Pitch_42", + "27": "Note_Pitch_43", + "28": "Note_Pitch_44", + "29": "Note_Pitch_45", + "30": "Note_Pitch_46", + "31": "Note_Pitch_47", + "32": "Note_Pitch_48", + "33": "Note_Pitch_49", + "34": "Note_Pitch_50", + "35": "Note_Pitch_51", + "36": "Note_Pitch_52", + "37": "Note_Pitch_53", + "38": "Note_Pitch_54", + "39": "Note_Pitch_55", + "40": "Note_Pitch_56", + "41": "Note_Pitch_57", + "42": "Note_Pitch_58", + "43": "Note_Pitch_59", + "44": "Note_Pitch_60", + "45": "Note_Pitch_61", + "46": "Note_Pitch_62", + "47": "Note_Pitch_63", + "48": "Note_Pitch_64", + "49": "Note_Pitch_65", + "50": "Note_Pitch_66", + "51": "Note_Pitch_67", + "52": "Note_Pitch_68", + "53": "Note_Pitch_69", + "54": "Note_Pitch_70", + "55": "Note_Pitch_71", + "56": "Note_Pitch_72", + "57": "Note_Pitch_73", + "58": "Note_Pitch_74", + "59": "Note_Pitch_75", + "60": "Note_Pitch_76", + "61": "Note_Pitch_77", + "62": "Note_Pitch_78", + "63": "Note_Pitch_79", + "64": "Note_Pitch_80", + "65": "Note_Pitch_81", + "66": "Note_Pitch_82", + "67": "Note_Pitch_83", + "68": "Note_Pitch_84", + "69": "Note_Pitch_85", + "70": "Note_Pitch_86", + "71": "Note_Pitch_87", + "72": "Note_Pitch_88", + "73": "Note_Pitch_89", + "74": "Note_Pitch_90", + "75": "Note_Pitch_91", + "76": "Note_Pitch_92", + "77": "Note_Pitch_93", + "78": "Note_Pitch_94", + "79": "Note_Pitch_95", + "80": "Note_Pitch_96", + "81": "Note_Pitch_97", + "82": "Note_Pitch_98", + "83": "Note_Pitch_99", + "84": "Note_Pitch_100", + "85": "Note_Pitch_101", + "86": "Note_Pitch_102", + "87": "Note_Pitch_103", + "88": "Note_Pitch_104", + "89": 
"Note_Pitch_105", + "90": "Note_Pitch_106", + "91": "Note_Pitch_107", + "92": "Note_Pitch_108", + "93": "Note_Pitch_109", + "94": "Note_Pitch_110", + "95": "Note_Pitch_111", + "96": "Note_Pitch_112", + "97": "Note_Pitch_113", + "98": "Note_Pitch_114", + "99": "Note_Pitch_115" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_Pop909/vocab_Pop909_remi8.json b/vocab/vocab_Pop909/vocab_Pop909_remi8.json new file mode 100644 index 0000000..812601e --- /dev/null +++ b/vocab/vocab_Pop909/vocab_Pop909_remi8.json @@ -0,0 +1,312 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/4", + "4": "Bar_time_signature_1/8", + "5": "Bar_time_signature_2/2", + "6": "Bar_time_signature_2/4", + "7": "Bar_time_signature_3/4", + "8": "Bar_time_signature_4/4", + "9": "Bar_time_signature_6/8", + "10": "Beat_0", + "11": "Beat_1", + "12": "Beat_2", + "13": "Beat_3", + "14": "Beat_4", + "15": "Beat_5", + "16": "Beat_6", + "17": "Beat_7", + "18": "Beat_8", + "19": "Beat_9", + "20": "Beat_10", + "21": "Beat_11", + "22": "Beat_12", + "23": "Beat_13", + "24": "Beat_14", + "25": "Beat_15", + "26": "Note_Duration_1", + "27": "Note_Duration_10", + "28": "Note_Duration_12", + "29": "Note_Duration_16", + "30": "Note_Duration_2", + "31": "Note_Duration_20", + "32": "Note_Duration_24", + "33": "Note_Duration_28", + "34": "Note_Duration_3", + "35": "Note_Duration_32", + "36": "Note_Duration_4", + "37": "Note_Duration_5", + "38": "Note_Duration_6", + "39": "Note_Duration_8", + "40": "Note_Velocity_100", + "41": "Note_Velocity_120", + "42": "Note_Velocity_40", + "43": "Note_Velocity_60", + "44": "Note_Velocity_80", + "45": "Tempo_100", + "46": "Tempo_104", + "47": "Tempo_108", + "48": "Tempo_112", + "49": "Tempo_116", + "50": "Tempo_121", + "51": "Tempo_126", + "52": "Tempo_131", + "53": "Tempo_136", + "54": "Tempo_141", + "55": "Tempo_147", + "56": "Tempo_159", + "57": "Tempo_172", + "58": "Tempo_179", + "59": "Tempo_193", + "60": "Tempo_30", + "61": "Tempo_31", + "62": "Tempo_34", + "63": "Tempo_36", + "64": "Tempo_37", + "65": "Tempo_38", + "66": "Tempo_40", + "67": "Tempo_42", + "68": "Tempo_44", + "69": "Tempo_46", + "70": "Tempo_48", + "71": "Tempo_50", + "72": "Tempo_52", + "73": "Tempo_54", + "74": "Tempo_56", + "75": "Tempo_58", + "76": "Tempo_60", + "77": "Tempo_62", + "78": "Tempo_64", + "79": "Tempo_67", + "80": "Tempo_70", + "81": "Tempo_73", + "82": "Tempo_76", + "83": "Tempo_79", + "84": "Tempo_82", + "85": "Tempo_85", + "86": "Tempo_88", + "87": "Tempo_92", + "88": "Tempo_96", + "89": "Note_Pitch_17", + "90": "Note_Pitch_18", + "91": "Note_Pitch_19", + "92": "Note_Pitch_20", + "93": "Note_Pitch_21", + "94": "Note_Pitch_22", + "95": "Note_Pitch_23", + "96": "Note_Pitch_24", + "97": "Note_Pitch_25", + "98": "Note_Pitch_26", + "99": "Note_Pitch_27", + "100": "Note_Pitch_28", + "101": "Note_Pitch_29", + "102": "Note_Pitch_30", + "103": "Note_Pitch_31", + "104": "Note_Pitch_32", + "105": "Note_Pitch_33", + 
"106": "Note_Pitch_34", + "107": "Note_Pitch_35", + "108": "Note_Pitch_36", + "109": "Note_Pitch_37", + "110": "Note_Pitch_38", + "111": "Note_Pitch_39", + "112": "Note_Pitch_40", + "113": "Note_Pitch_41", + "114": "Note_Pitch_42", + "115": "Note_Pitch_43", + "116": "Note_Pitch_44", + "117": "Note_Pitch_45", + "118": "Note_Pitch_46", + "119": "Note_Pitch_47", + "120": "Note_Pitch_48", + "121": "Note_Pitch_49", + "122": "Note_Pitch_50", + "123": "Note_Pitch_51", + "124": "Note_Pitch_52", + "125": "Note_Pitch_53", + "126": "Note_Pitch_54", + "127": "Note_Pitch_55", + "128": "Note_Pitch_56", + "129": "Note_Pitch_57", + "130": "Note_Pitch_58", + "131": "Note_Pitch_59", + "132": "Note_Pitch_60", + "133": "Note_Pitch_61", + "134": "Note_Pitch_62", + "135": "Note_Pitch_63", + "136": "Note_Pitch_64", + "137": "Note_Pitch_65", + "138": "Note_Pitch_66", + "139": "Note_Pitch_67", + "140": "Note_Pitch_68", + "141": "Note_Pitch_69", + "142": "Note_Pitch_70", + "143": "Note_Pitch_71", + "144": "Note_Pitch_72", + "145": "Note_Pitch_73", + "146": "Note_Pitch_74", + "147": "Note_Pitch_75", + "148": "Note_Pitch_76", + "149": "Note_Pitch_77", + "150": "Note_Pitch_78", + "151": "Note_Pitch_79", + "152": "Note_Pitch_80", + "153": "Note_Pitch_81", + "154": "Note_Pitch_82", + "155": "Note_Pitch_83", + "156": "Note_Pitch_84", + "157": "Note_Pitch_85", + "158": "Note_Pitch_86", + "159": "Note_Pitch_87", + "160": "Note_Pitch_88", + "161": "Note_Pitch_89", + "162": "Note_Pitch_90", + "163": "Note_Pitch_91", + "164": "Note_Pitch_92", + "165": "Note_Pitch_93", + "166": "Note_Pitch_94", + "167": "Note_Pitch_95", + "168": "Note_Pitch_96", + "169": "Note_Pitch_97", + "170": "Note_Pitch_98", + "171": "Note_Pitch_99", + "172": "Note_Pitch_100", + "173": "Note_Pitch_101", + "174": "Note_Pitch_102", + "175": "Note_Pitch_103", + "176": "Note_Pitch_104", + "177": "Note_Pitch_105", + "178": "Note_Pitch_106", + "179": "Note_Pitch_107", + "180": "Note_Pitch_108", + "181": "Note_Pitch_109", + "182": "Note_Pitch_110", + "183": "Note_Pitch_111", + "184": "Note_Pitch_112", + "185": "Note_Pitch_113", + "186": "Note_Pitch_114", + "187": "Note_Pitch_115", + "188": "Instrument_0", + "189": "Chord_A_+", + "190": "Chord_A#_+", + "191": "Chord_B_+", + "192": "Chord_C_+", + "193": "Chord_C#_+", + "194": "Chord_D_+", + "195": "Chord_D#_+", + "196": "Chord_E_+", + "197": "Chord_F_+", + "198": "Chord_F#_+", + "199": "Chord_G_+", + "200": "Chord_G#_+", + "201": "Chord_A_/o7", + "202": "Chord_A#_/o7", + "203": "Chord_B_/o7", + "204": "Chord_C_/o7", + "205": "Chord_C#_/o7", + "206": "Chord_D#_/o7", + "207": "Chord_E_/o7", + "208": "Chord_F_/o7", + "209": "Chord_F#_/o7", + "210": "Chord_G_/o7", + "211": "Chord_G#_/o7", + "212": "Chord_A_7", + "213": "Chord_A#_7", + "214": "Chord_B_7", + "215": "Chord_C_7", + "216": "Chord_C#_7", + "217": "Chord_D_7", + "218": "Chord_D#_7", + "219": "Chord_E_7", + "220": "Chord_F_7", + "221": "Chord_F#_7", + "222": "Chord_G_7", + "223": "Chord_G#_7", + "224": "Chord_A_M", + "225": "Chord_A#_M", + "226": "Chord_B_M", + "227": "Chord_C_M", + "228": "Chord_C#_M", + "229": "Chord_D_M", + "230": "Chord_D#_M", + "231": "Chord_E_M", + "232": "Chord_F_M", + "233": "Chord_F#_M", + "234": "Chord_G_M", + "235": "Chord_G#_M", + "236": "Chord_A_M7", + "237": "Chord_A#_M7", + "238": "Chord_B_M7", + "239": "Chord_C_M7", + "240": "Chord_C#_M7", + "241": "Chord_D_M7", + "242": "Chord_D#_M7", + "243": "Chord_E_M7", + "244": "Chord_F_M7", + "245": "Chord_F#_M7", + "246": "Chord_G_M7", + "247": "Chord_G#_M7", + "248": "Chord_A_m", + 
"249": "Chord_A#_m", + "250": "Chord_B_m", + "251": "Chord_C_m", + "252": "Chord_C#_m", + "253": "Chord_D_m", + "254": "Chord_D#_m", + "255": "Chord_E_m", + "256": "Chord_F_m", + "257": "Chord_F#_m", + "258": "Chord_G_m", + "259": "Chord_G#_m", + "260": "Chord_A_m7", + "261": "Chord_A#_m7", + "262": "Chord_B_m7", + "263": "Chord_C_m7", + "264": "Chord_C#_m7", + "265": "Chord_D_m7", + "266": "Chord_D#_m7", + "267": "Chord_E_m7", + "268": "Chord_F_m7", + "269": "Chord_F#_m7", + "270": "Chord_G_m7", + "271": "Chord_G#_m7", + "272": "Chord_A_o", + "273": "Chord_A#_o", + "274": "Chord_B_o", + "275": "Chord_C_o", + "276": "Chord_C#_o", + "277": "Chord_D_o", + "278": "Chord_D#_o", + "279": "Chord_E_o", + "280": "Chord_F_o", + "281": "Chord_F#_o", + "282": "Chord_G_o", + "283": "Chord_G#_o", + "284": "Chord_E_o7", + "285": "Chord_A_sus2", + "286": "Chord_A#_sus2", + "287": "Chord_B_sus2", + "288": "Chord_C_sus2", + "289": "Chord_C#_sus2", + "290": "Chord_D_sus2", + "291": "Chord_D#_sus2", + "292": "Chord_E_sus2", + "293": "Chord_F_sus2", + "294": "Chord_F#_sus2", + "295": "Chord_G_sus2", + "296": "Chord_G#_sus2", + "297": "Chord_A_sus4", + "298": "Chord_A#_sus4", + "299": "Chord_B_sus4", + "300": "Chord_C_sus4", + "301": "Chord_C#_sus4", + "302": "Chord_D_sus4", + "303": "Chord_D#_sus4", + "304": "Chord_E_sus4", + "305": "Chord_F_sus4", + "306": "Chord_F#_sus4", + "307": "Chord_G_sus4", + "308": "Chord_G#_sus4", + "309": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json b/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_PretrainingDataset/vocab_PretrainingDataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": 
"Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": 
"Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + 
"114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": 
"Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_cp5.json b/vocab/vocab_SOD/vocab_SOD_cp5.json new file mode 100644 index 0000000..a81d79f --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_cp5.json @@ -0,0 +1,340 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Metrical", + "3": "Note" + }, + "beat": { + "0": 0, + "1": "Bar", + "2": "Bar_time_signature_1/1", + "3": "Bar_time_signature_1/2", + "4": "Bar_time_signature_1/4", + "5": "Bar_time_signature_1/8", + "6": "Bar_time_signature_11/8", + "7": "Bar_time_signature_12/8", + "8": "Bar_time_signature_2/2", + "9": "Bar_time_signature_2/4", + "10": "Bar_time_signature_2/8", + "11": "Bar_time_signature_3/2", + "12": "Bar_time_signature_3/4", + "13": "Bar_time_signature_3/8", + "14": "Bar_time_signature_4/2", + "15": "Bar_time_signature_4/4", + "16": "Bar_time_signature_4/8", + "17": "Bar_time_signature_5/4", + "18": "Bar_time_signature_5/8", + "19": "Bar_time_signature_6/4", + "20": "Bar_time_signature_6/8", + "21": "Bar_time_signature_7/4", + "22": "Bar_time_signature_7/8", + "23": "Bar_time_signature_8/4", + "24": "Bar_time_signature_8/8", + "25": "Bar_time_signature_9/8", + "26": "Beat_0", + "27": "Beat_1", + "28": "Beat_2", + "29": "Beat_3", + "30": "Beat_4", + "31": "Beat_5", + "32": "Beat_6", + "33": "Beat_7", + "34": "Beat_8", + "35": "Beat_9", + "36": "Beat_10", + "37": "Beat_11", + "38": "Beat_12", + "39": "Beat_13", + "40": "Beat_14", + "41": "Beat_15", + "42": "Beat_16", + "43": "Beat_17", + "44": "Beat_18", + "45": "Beat_19", + "46": "Beat_20", + "47": "Beat_21", + "48": "Beat_22", + "49": "Beat_23", + "50": "Beat_24", + "51": "Beat_25", + "52": "Beat_26", + "53": "Beat_27", + "54": "Beat_28", + "55": "Beat_29", + "56": "Beat_30", + "57": "Beat_31", + "58": "Beat_32", + "59": "Beat_33", + "60": "Beat_34", + "61": "Beat_35", + "62": "Beat_36", + "63": "Beat_37", + "64": "Beat_38", + "65": "Beat_39", + "66": "Beat_40", + "67": "Beat_41", + "68": "Beat_42", + "69": "Beat_43", + "70": "Beat_44", + "71": "Beat_45", + "72": "Beat_46", + "73": "Beat_47", + "74": "Beat_48", + "75": "Beat_49", + "76": "Beat_50", + "77": "Beat_51", + "78": "Beat_52", + "79": "Beat_53", + "80": "Beat_54", + "81": "Beat_55", + "82": "Beat_56", + "83": "Beat_57", + "84": "Beat_58", + "85": "Beat_59", + "86": "Beat_60", + "87": "Beat_61", + "88": "Beat_62", + "89": "Beat_63", + "90": "Beat_64", + "91": "Beat_65", + "92": "Beat_66", + "93": "Beat_67", + "94": "Beat_68", + "95": "Beat_69", + "96": "Beat_70", + "97": "Beat_71", + "98": "Beat_72", + "99": "Beat_73", + "100": "Beat_74", + "101": "Beat_75", + "102": "Beat_76", + "103": "Beat_77", + "104": "Beat_78", + "105": "Beat_79", + "106": "Beat_80", + "107": "Beat_81", + "108": "Beat_82", + "109": "Beat_83", + "110": "Beat_84", + "111": "Beat_85", + "112": "Beat_86", + "113": "Beat_87", + "114": "Beat_88", + "115": "Beat_89", + "116": "Beat_90", + "117": "Beat_91", + "118": "Beat_92", + "119": "Beat_93", + "120": "Beat_94", + "121": "Beat_95" + }, + "instrument": { + "0": 0, + "1": 
"Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": "Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + 
"82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_nb5.json b/vocab/vocab_SOD/vocab_SOD_nb5.json new file mode 100644 index 0000000..aeb5d1e --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_nb5.json @@ -0,0 +1,341 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31", + "33": "Beat_32", + "34": "Beat_33", + "35": "Beat_34", + "36": "Beat_35", + "37": "Beat_36", + "38": "Beat_37", + "39": "Beat_38", + "40": "Beat_39", + "41": "Beat_40", + "42": "Beat_41", + "43": "Beat_42", + "44": "Beat_43", + "45": 
"Beat_44", + "46": "Beat_45", + "47": "Beat_46", + "48": "Beat_47", + "49": "Beat_48", + "50": "Beat_49", + "51": "Beat_50", + "52": "Beat_51", + "53": "Beat_52", + "54": "Beat_53", + "55": "Beat_54", + "56": "Beat_55", + "57": "Beat_56", + "58": "Beat_57", + "59": "Beat_58", + "60": "Beat_59", + "61": "Beat_60", + "62": "Beat_61", + "63": "Beat_62", + "64": "Beat_63", + "65": "Beat_64", + "66": "Beat_65", + "67": "Beat_66", + "68": "Beat_67", + "69": "Beat_68", + "70": "Beat_69", + "71": "Beat_70", + "72": "Beat_71", + "73": "Beat_72", + "74": "Beat_73", + "75": "Beat_74", + "76": "Beat_75", + "77": "Beat_76", + "78": "Beat_77", + "79": "Beat_78", + "80": "Beat_79", + "81": "Beat_80", + "82": "Beat_81", + "83": "Beat_82", + "84": "Beat_83", + "85": "Beat_84", + "86": "Beat_85", + "87": "Beat_86", + "88": "Beat_87", + "89": "Beat_88", + "90": "Beat_89", + "91": "Beat_90", + "92": "Beat_91", + "93": "Beat_92", + "94": "Beat_93", + "95": "Beat_94", + "96": "Beat_95" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": "Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": 
"Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_nb8.json b/vocab/vocab_SOD/vocab_SOD_nb8.json new file mode 100644 index 0000000..e3a24fb --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_nb8.json @@ -0,0 +1,572 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": 
"NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31", + "33": "Beat_32", + "34": "Beat_33", + "35": "Beat_34", + "36": "Beat_35", + "37": "Beat_36", + "38": "Beat_37", + "39": "Beat_38", + "40": "Beat_39", + "41": "Beat_40", + "42": "Beat_41", + "43": "Beat_42", + "44": "Beat_43", + "45": "Beat_44", + "46": "Beat_45", + "47": "Beat_46", + "48": "Beat_47", + "49": "Beat_48", + "50": "Beat_49", + "51": "Beat_50", + "52": "Beat_51", + "53": "Beat_52", + "54": "Beat_53", + "55": "Beat_54", + "56": "Beat_55", + "57": "Beat_56", + "58": "Beat_57", + "59": "Beat_58", + "60": "Beat_59", + "61": "Beat_60", + "62": "Beat_61", + "63": "Beat_62", + "64": "Beat_63", + "65": "Beat_64", + "66": "Beat_65", + "67": "Beat_66", + "68": "Beat_67", + "69": "Beat_68", + "70": "Beat_69", + "71": "Beat_70", + "72": "Beat_71", + "73": "Beat_72", + "74": "Beat_73", + "75": "Beat_74", + "76": "Beat_75", + "77": "Beat_76", + "78": "Beat_77", + "79": "Beat_78", + "80": "Beat_79", + "81": "Beat_80", + "82": "Beat_81", + "83": "Beat_82", + "84": "Beat_83", + "85": "Beat_84", + "86": "Beat_85", + "87": "Beat_86", + "88": "Beat_87", + "89": "Beat_88", + "90": "Beat_89", + "91": "Beat_90", + "92": "Beat_91", + "93": "Beat_92", + "94": "Beat_93", + "95": "Beat_94", + "96": "Beat_95" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": 
"Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_53", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_77", + "29": "Tempo_79", + "30": "Tempo_82", + "31": "Tempo_85", + "32": "Tempo_88", + "33": "Tempo_92", + "34": "Tempo_94", + "35": "Tempo_96", + "36": "Tempo_100", + "37": "Tempo_103", + "38": "Tempo_104", + "39": "Tempo_108", + "40": "Tempo_112", + "41": "Tempo_113", + "42": "Tempo_116", + "43": "Tempo_121", + "44": "Tempo_124", + "45": "Tempo_126", + "46": "Tempo_131", + "47": "Tempo_136", + "48": "Tempo_141", + "49": "Tempo_147", + "50": "Tempo_150", + "51": "Tempo_153", + "52": "Tempo_159", + "53": "Tempo_165", + "54": "Tempo_172", + "55": "Tempo_179", + "56": "Tempo_182", + "57": "Tempo_186", + "58": "Tempo_193", + "59": "Tempo_200", + "60": "Tempo_201", + "61": "Tempo_209", + "62": "Tempo_217", + "63": "Tempo_220", + "64": "Tempo_226", + "65": "Tempo_235", + "66": "Tempo_242", + "67": "Tempo_244", + "68": "Tempo_254", + "69": "Tempo_264", + "70": "Tempo_266", + "71": "Tempo_275", + "72": "Tempo_286", + "73": "Tempo_293", + "74": "Tempo_297", + "75": "Tempo_309", + "76": "Tempo_321", + "77": "Tempo_322", + "78": "Tempo_334", + "79": "Tempo_347", + "80": "Tempo_354", + "81": "Tempo_361", + "82": "Tempo_375", + "83": "Tempo_389", + "84": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_11", + "8": "Instrument_12", + "9": "Instrument_13", + "10": "Instrument_14", + "11": "Instrument_15", + "12": "Instrument_16", + "13": 
"Instrument_19", + "14": "Instrument_21", + "15": "Instrument_22", + "16": "Instrument_23", + "17": "Instrument_24", + "18": "Instrument_25", + "19": "Instrument_26", + "20": "Instrument_32", + "21": "Instrument_33", + "22": "Instrument_36", + "23": "Instrument_38", + "24": "Instrument_40", + "25": "Instrument_41", + "26": "Instrument_42", + "27": "Instrument_43", + "28": "Instrument_46", + "29": "Instrument_47", + "30": "Instrument_49", + "31": "Instrument_50", + "32": "Instrument_52", + "33": "Instrument_55", + "34": "Instrument_56", + "35": "Instrument_57", + "36": "Instrument_58", + "37": "Instrument_60", + "38": "Instrument_61", + "39": "Instrument_62", + "40": "Instrument_64", + "41": "Instrument_65", + "42": "Instrument_66", + "43": "Instrument_67", + "44": "Instrument_68", + "45": "Instrument_69", + "46": "Instrument_70", + "47": "Instrument_71", + "48": "Instrument_72", + "49": "Instrument_73", + "50": "Instrument_74", + "51": "Instrument_75", + "52": "Instrument_79", + "53": "Instrument_80", + "54": "Instrument_88", + "55": "Instrument_105", + "56": "Instrument_108", + "57": "Instrument_109", + "58": "Instrument_111", + "59": "Instrument_114", + "60": "Instrument_117", + "61": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": 
"Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_6", + "6": "Note_Duration_9", + "7": "Note_Duration_12", + "8": "Note_Duration_15", + "9": "Note_Duration_18", + "10": "Note_Duration_24", + "11": "Note_Duration_30", + "12": "Note_Duration_36", + "13": "Note_Duration_42", + "14": "Note_Duration_48", + "15": "Note_Duration_54", + "16": "Note_Duration_60", + "17": "Note_Duration_72", + "18": "Note_Duration_84", + "19": "Note_Duration_96" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_remi5.json b/vocab/vocab_SOD/vocab_SOD_remi5.json new file mode 100644 index 0000000..c6373bd --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_remi5.json @@ -0,0 +1,324 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/1", + "4": "Bar_time_signature_1/2", + "5": "Bar_time_signature_1/4", + "6": "Bar_time_signature_1/8", + "7": "Bar_time_signature_11/8", + "8": "Bar_time_signature_12/8", + "9": "Bar_time_signature_2/2", + "10": "Bar_time_signature_2/4", + "11": "Bar_time_signature_2/8", + "12": "Bar_time_signature_3/2", + "13": "Bar_time_signature_3/4", + "14": "Bar_time_signature_3/8", + "15": "Bar_time_signature_4/2", + "16": "Bar_time_signature_4/4", + "17": "Bar_time_signature_4/8", + "18": "Bar_time_signature_5/4", + "19": "Bar_time_signature_5/8", + "20": "Bar_time_signature_6/4", + "21": "Bar_time_signature_6/8", + "22": "Bar_time_signature_7/4", + "23": "Bar_time_signature_7/8", + "24": "Bar_time_signature_8/4", + "25": "Bar_time_signature_8/8", + "26": "Bar_time_signature_9/8", + "27": "Beat_0", + "28": "Beat_1", + "29": "Beat_2", + "30": "Beat_3", + "31": "Beat_4", + "32": "Beat_5", + "33": "Beat_6", + "34": "Beat_7", + "35": "Beat_8", + "36": "Beat_9", + "37": "Beat_10", + "38": "Beat_11", + "39": "Beat_12", + "40": "Beat_13", + "41": "Beat_14", + "42": "Beat_15", + "43": "Beat_16", + "44": "Beat_17", + "45": "Beat_18", + "46": "Beat_19", + "47": "Beat_20", + "48": "Beat_21", + "49": "Beat_22", + "50": "Beat_23", + "51": "Beat_24", + "52": "Beat_25", + "53": "Beat_26", + "54": "Beat_27", + "55": "Beat_28", + "56": "Beat_29", + "57": "Beat_30", + "58": "Beat_31", + "59": "Beat_32", + "60": "Beat_33", + "61": "Beat_34", + "62": "Beat_35", + "63": "Beat_36", + "64": "Beat_37", + "65": "Beat_38", + "66": "Beat_39", + "67": "Beat_40", + "68": "Beat_41", + "69": "Beat_42", + "70": "Beat_43", + "71": "Beat_44", + "72": "Beat_45", + "73": "Beat_46", + "74": "Beat_47", + "75": "Beat_48", + "76": "Beat_49", + "77": "Beat_50", + "78": "Beat_51", + "79": "Beat_52", + "80": "Beat_53", 
+ "81": "Beat_54", + "82": "Beat_55", + "83": "Beat_56", + "84": "Beat_57", + "85": "Beat_58", + "86": "Beat_59", + "87": "Beat_60", + "88": "Beat_61", + "89": "Beat_62", + "90": "Beat_63", + "91": "Beat_64", + "92": "Beat_65", + "93": "Beat_66", + "94": "Beat_67", + "95": "Beat_68", + "96": "Beat_69", + "97": "Beat_70", + "98": "Beat_71", + "99": "Beat_72", + "100": "Beat_73", + "101": "Beat_74", + "102": "Beat_75", + "103": "Beat_76", + "104": "Beat_77", + "105": "Beat_78", + "106": "Beat_79", + "107": "Beat_80", + "108": "Beat_81", + "109": "Beat_82", + "110": "Beat_83", + "111": "Beat_84", + "112": "Beat_85", + "113": "Beat_86", + "114": "Beat_87", + "115": "Beat_88", + "116": "Beat_89", + "117": "Beat_90", + "118": "Beat_91", + "119": "Beat_92", + "120": "Beat_93", + "121": "Beat_94", + "122": "Beat_95", + "123": "Note_Duration_1", + "124": "Note_Duration_12", + "125": "Note_Duration_15", + "126": "Note_Duration_18", + "127": "Note_Duration_2", + "128": "Note_Duration_24", + "129": "Note_Duration_3", + "130": "Note_Duration_30", + "131": "Note_Duration_36", + "132": "Note_Duration_4", + "133": "Note_Duration_42", + "134": "Note_Duration_48", + "135": "Note_Duration_54", + "136": "Note_Duration_6", + "137": "Note_Duration_60", + "138": "Note_Duration_72", + "139": "Note_Duration_84", + "140": "Note_Duration_9", + "141": "Note_Duration_96", + "142": "Note_Pitch_6", + "143": "Note_Pitch_7", + "144": "Note_Pitch_8", + "145": "Note_Pitch_9", + "146": "Note_Pitch_10", + "147": "Note_Pitch_11", + "148": "Note_Pitch_12", + "149": "Note_Pitch_13", + "150": "Note_Pitch_14", + "151": "Note_Pitch_15", + "152": "Note_Pitch_16", + "153": "Note_Pitch_17", + "154": "Note_Pitch_18", + "155": "Note_Pitch_19", + "156": "Note_Pitch_20", + "157": "Note_Pitch_21", + "158": "Note_Pitch_22", + "159": "Note_Pitch_23", + "160": "Note_Pitch_24", + "161": "Note_Pitch_25", + "162": "Note_Pitch_26", + "163": "Note_Pitch_27", + "164": "Note_Pitch_28", + "165": "Note_Pitch_29", + "166": "Note_Pitch_30", + "167": "Note_Pitch_31", + "168": "Note_Pitch_32", + "169": "Note_Pitch_33", + "170": "Note_Pitch_34", + "171": "Note_Pitch_35", + "172": "Note_Pitch_36", + "173": "Note_Pitch_37", + "174": "Note_Pitch_38", + "175": "Note_Pitch_39", + "176": "Note_Pitch_40", + "177": "Note_Pitch_41", + "178": "Note_Pitch_42", + "179": "Note_Pitch_43", + "180": "Note_Pitch_44", + "181": "Note_Pitch_45", + "182": "Note_Pitch_46", + "183": "Note_Pitch_47", + "184": "Note_Pitch_48", + "185": "Note_Pitch_49", + "186": "Note_Pitch_50", + "187": "Note_Pitch_51", + "188": "Note_Pitch_52", + "189": "Note_Pitch_53", + "190": "Note_Pitch_54", + "191": "Note_Pitch_55", + "192": "Note_Pitch_56", + "193": "Note_Pitch_57", + "194": "Note_Pitch_58", + "195": "Note_Pitch_59", + "196": "Note_Pitch_60", + "197": "Note_Pitch_61", + "198": "Note_Pitch_62", + "199": "Note_Pitch_63", + "200": "Note_Pitch_64", + "201": "Note_Pitch_65", + "202": "Note_Pitch_66", + "203": "Note_Pitch_67", + "204": "Note_Pitch_68", + "205": "Note_Pitch_69", + "206": "Note_Pitch_70", + "207": "Note_Pitch_71", + "208": "Note_Pitch_72", + "209": "Note_Pitch_73", + "210": "Note_Pitch_74", + "211": "Note_Pitch_75", + "212": "Note_Pitch_76", + "213": "Note_Pitch_77", + "214": "Note_Pitch_78", + "215": "Note_Pitch_79", + "216": "Note_Pitch_80", + "217": "Note_Pitch_81", + "218": "Note_Pitch_82", + "219": "Note_Pitch_83", + "220": "Note_Pitch_84", + "221": "Note_Pitch_85", + "222": "Note_Pitch_86", + "223": "Note_Pitch_87", + "224": "Note_Pitch_88", + "225": "Note_Pitch_89", + "226": 
"Note_Pitch_90", + "227": "Note_Pitch_91", + "228": "Note_Pitch_92", + "229": "Note_Pitch_93", + "230": "Note_Pitch_94", + "231": "Note_Pitch_95", + "232": "Note_Pitch_96", + "233": "Note_Pitch_97", + "234": "Note_Pitch_98", + "235": "Note_Pitch_99", + "236": "Note_Pitch_100", + "237": "Note_Pitch_101", + "238": "Note_Pitch_102", + "239": "Note_Pitch_103", + "240": "Note_Pitch_104", + "241": "Note_Pitch_105", + "242": "Note_Pitch_106", + "243": "Note_Pitch_107", + "244": "Note_Pitch_108", + "245": "Note_Pitch_109", + "246": "Note_Pitch_110", + "247": "Note_Pitch_111", + "248": "Note_Pitch_112", + "249": "Note_Pitch_113", + "250": "Note_Pitch_114", + "251": "Note_Pitch_115", + "252": "Note_Pitch_116", + "253": "Note_Pitch_117", + "254": "Note_Pitch_118", + "255": "Note_Pitch_119", + "256": "Note_Pitch_120", + "257": "Note_Pitch_121", + "258": "Note_Pitch_122", + "259": "Note_Pitch_123", + "260": "Note_Pitch_124", + "261": "Instrument_0", + "262": "Instrument_4", + "263": "Instrument_6", + "264": "Instrument_7", + "265": "Instrument_8", + "266": "Instrument_9", + "267": "Instrument_11", + "268": "Instrument_12", + "269": "Instrument_13", + "270": "Instrument_14", + "271": "Instrument_15", + "272": "Instrument_16", + "273": "Instrument_19", + "274": "Instrument_21", + "275": "Instrument_22", + "276": "Instrument_23", + "277": "Instrument_24", + "278": "Instrument_25", + "279": "Instrument_26", + "280": "Instrument_32", + "281": "Instrument_33", + "282": "Instrument_36", + "283": "Instrument_38", + "284": "Instrument_40", + "285": "Instrument_41", + "286": "Instrument_42", + "287": "Instrument_43", + "288": "Instrument_46", + "289": "Instrument_47", + "290": "Instrument_49", + "291": "Instrument_50", + "292": "Instrument_52", + "293": "Instrument_55", + "294": "Instrument_56", + "295": "Instrument_57", + "296": "Instrument_58", + "297": "Instrument_60", + "298": "Instrument_61", + "299": "Instrument_62", + "300": "Instrument_64", + "301": "Instrument_65", + "302": "Instrument_66", + "303": "Instrument_67", + "304": "Instrument_68", + "305": "Instrument_69", + "306": "Instrument_70", + "307": "Instrument_71", + "308": "Instrument_72", + "309": "Instrument_73", + "310": "Instrument_74", + "311": "Instrument_75", + "312": "Instrument_79", + "313": "Instrument_80", + "314": "Instrument_88", + "315": "Instrument_105", + "316": "Instrument_108", + "317": "Instrument_109", + "318": "Instrument_111", + "319": "Instrument_114", + "320": "Instrument_117", + "321": "Instrument_118" +} \ No newline at end of file diff --git a/vocab/vocab_SOD/vocab_SOD_remi8.json b/vocab/vocab_SOD/vocab_SOD_remi8.json new file mode 100644 index 0000000..e5e252a --- /dev/null +++ b/vocab/vocab_SOD/vocab_SOD_remi8.json @@ -0,0 +1,546 @@ +{ + "0": "SOS_None", + "1": "EOS_None", + "2": "Bar_None", + "3": "Bar_time_signature_1/1", + "4": "Bar_time_signature_1/2", + "5": "Bar_time_signature_1/4", + "6": "Bar_time_signature_1/8", + "7": "Bar_time_signature_11/8", + "8": "Bar_time_signature_12/8", + "9": "Bar_time_signature_2/2", + "10": "Bar_time_signature_2/4", + "11": "Bar_time_signature_2/8", + "12": "Bar_time_signature_3/2", + "13": "Bar_time_signature_3/4", + "14": "Bar_time_signature_3/8", + "15": "Bar_time_signature_4/2", + "16": "Bar_time_signature_4/4", + "17": "Bar_time_signature_4/8", + "18": "Bar_time_signature_5/4", + "19": "Bar_time_signature_5/8", + "20": "Bar_time_signature_6/4", + "21": "Bar_time_signature_6/8", + "22": "Bar_time_signature_7/4", + "23": "Bar_time_signature_7/8", + "24": "Bar_time_signature_8/4", 
+ "25": "Bar_time_signature_8/8", + "26": "Bar_time_signature_9/8", + "27": "Beat_0", + "28": "Beat_1", + "29": "Beat_2", + "30": "Beat_3", + "31": "Beat_4", + "32": "Beat_5", + "33": "Beat_6", + "34": "Beat_7", + "35": "Beat_8", + "36": "Beat_9", + "37": "Beat_10", + "38": "Beat_11", + "39": "Beat_12", + "40": "Beat_13", + "41": "Beat_14", + "42": "Beat_15", + "43": "Beat_16", + "44": "Beat_17", + "45": "Beat_18", + "46": "Beat_19", + "47": "Beat_20", + "48": "Beat_21", + "49": "Beat_22", + "50": "Beat_23", + "51": "Beat_24", + "52": "Beat_25", + "53": "Beat_26", + "54": "Beat_27", + "55": "Beat_28", + "56": "Beat_29", + "57": "Beat_30", + "58": "Beat_31", + "59": "Beat_32", + "60": "Beat_33", + "61": "Beat_34", + "62": "Beat_35", + "63": "Beat_36", + "64": "Beat_37", + "65": "Beat_38", + "66": "Beat_39", + "67": "Beat_40", + "68": "Beat_41", + "69": "Beat_42", + "70": "Beat_43", + "71": "Beat_44", + "72": "Beat_45", + "73": "Beat_46", + "74": "Beat_47", + "75": "Beat_48", + "76": "Beat_49", + "77": "Beat_50", + "78": "Beat_51", + "79": "Beat_52", + "80": "Beat_53", + "81": "Beat_54", + "82": "Beat_55", + "83": "Beat_56", + "84": "Beat_57", + "85": "Beat_58", + "86": "Beat_59", + "87": "Beat_60", + "88": "Beat_61", + "89": "Beat_62", + "90": "Beat_63", + "91": "Beat_64", + "92": "Beat_65", + "93": "Beat_66", + "94": "Beat_67", + "95": "Beat_68", + "96": "Beat_69", + "97": "Beat_70", + "98": "Beat_71", + "99": "Beat_72", + "100": "Beat_73", + "101": "Beat_74", + "102": "Beat_75", + "103": "Beat_76", + "104": "Beat_77", + "105": "Beat_78", + "106": "Beat_79", + "107": "Beat_80", + "108": "Beat_81", + "109": "Beat_82", + "110": "Beat_83", + "111": "Beat_84", + "112": "Beat_85", + "113": "Beat_86", + "114": "Beat_87", + "115": "Beat_88", + "116": "Beat_89", + "117": "Beat_90", + "118": "Beat_91", + "119": "Beat_92", + "120": "Beat_93", + "121": "Beat_94", + "122": "Beat_95", + "123": "Note_Duration_1", + "124": "Note_Duration_12", + "125": "Note_Duration_15", + "126": "Note_Duration_18", + "127": "Note_Duration_2", + "128": "Note_Duration_24", + "129": "Note_Duration_3", + "130": "Note_Duration_30", + "131": "Note_Duration_36", + "132": "Note_Duration_4", + "133": "Note_Duration_42", + "134": "Note_Duration_48", + "135": "Note_Duration_54", + "136": "Note_Duration_6", + "137": "Note_Duration_60", + "138": "Note_Duration_72", + "139": "Note_Duration_84", + "140": "Note_Duration_9", + "141": "Note_Duration_96", + "142": "Note_Velocity_100", + "143": "Note_Velocity_120", + "144": "Note_Velocity_40", + "145": "Note_Velocity_60", + "146": "Note_Velocity_80", + "147": "Tempo_100", + "148": "Tempo_103", + "149": "Tempo_104", + "150": "Tempo_108", + "151": "Tempo_112", + "152": "Tempo_113", + "153": "Tempo_116", + "154": "Tempo_121", + "155": "Tempo_124", + "156": "Tempo_126", + "157": "Tempo_131", + "158": "Tempo_136", + "159": "Tempo_141", + "160": "Tempo_147", + "161": "Tempo_150", + "162": "Tempo_153", + "163": "Tempo_159", + "164": "Tempo_165", + "165": "Tempo_172", + "166": "Tempo_179", + "167": "Tempo_182", + "168": "Tempo_186", + "169": "Tempo_193", + "170": "Tempo_200", + "171": "Tempo_201", + "172": "Tempo_209", + "173": "Tempo_217", + "174": "Tempo_220", + "175": "Tempo_226", + "176": "Tempo_235", + "177": "Tempo_242", + "178": "Tempo_244", + "179": "Tempo_254", + "180": "Tempo_264", + "181": "Tempo_266", + "182": "Tempo_275", + "183": "Tempo_286", + "184": "Tempo_293", + "185": "Tempo_297", + "186": "Tempo_30", + "187": "Tempo_309", + "188": "Tempo_31", + "189": "Tempo_32", + "190": 
"Tempo_321", + "191": "Tempo_322", + "192": "Tempo_33", + "193": "Tempo_334", + "194": "Tempo_34", + "195": "Tempo_347", + "196": "Tempo_35", + "197": "Tempo_354", + "198": "Tempo_36", + "199": "Tempo_361", + "200": "Tempo_37", + "201": "Tempo_375", + "202": "Tempo_38", + "203": "Tempo_389", + "204": "Tempo_390", + "205": "Tempo_40", + "206": "Tempo_42", + "207": "Tempo_44", + "208": "Tempo_46", + "209": "Tempo_48", + "210": "Tempo_50", + "211": "Tempo_52", + "212": "Tempo_53", + "213": "Tempo_54", + "214": "Tempo_56", + "215": "Tempo_58", + "216": "Tempo_60", + "217": "Tempo_62", + "218": "Tempo_64", + "219": "Tempo_67", + "220": "Tempo_70", + "221": "Tempo_73", + "222": "Tempo_76", + "223": "Tempo_77", + "224": "Tempo_79", + "225": "Tempo_82", + "226": "Tempo_85", + "227": "Tempo_88", + "228": "Tempo_92", + "229": "Tempo_94", + "230": "Tempo_96", + "231": "Note_Pitch_6", + "232": "Note_Pitch_7", + "233": "Note_Pitch_8", + "234": "Note_Pitch_9", + "235": "Note_Pitch_10", + "236": "Note_Pitch_11", + "237": "Note_Pitch_12", + "238": "Note_Pitch_13", + "239": "Note_Pitch_14", + "240": "Note_Pitch_15", + "241": "Note_Pitch_16", + "242": "Note_Pitch_17", + "243": "Note_Pitch_18", + "244": "Note_Pitch_19", + "245": "Note_Pitch_20", + "246": "Note_Pitch_21", + "247": "Note_Pitch_22", + "248": "Note_Pitch_23", + "249": "Note_Pitch_24", + "250": "Note_Pitch_25", + "251": "Note_Pitch_26", + "252": "Note_Pitch_27", + "253": "Note_Pitch_28", + "254": "Note_Pitch_29", + "255": "Note_Pitch_30", + "256": "Note_Pitch_31", + "257": "Note_Pitch_32", + "258": "Note_Pitch_33", + "259": "Note_Pitch_34", + "260": "Note_Pitch_35", + "261": "Note_Pitch_36", + "262": "Note_Pitch_37", + "263": "Note_Pitch_38", + "264": "Note_Pitch_39", + "265": "Note_Pitch_40", + "266": "Note_Pitch_41", + "267": "Note_Pitch_42", + "268": "Note_Pitch_43", + "269": "Note_Pitch_44", + "270": "Note_Pitch_45", + "271": "Note_Pitch_46", + "272": "Note_Pitch_47", + "273": "Note_Pitch_48", + "274": "Note_Pitch_49", + "275": "Note_Pitch_50", + "276": "Note_Pitch_51", + "277": "Note_Pitch_52", + "278": "Note_Pitch_53", + "279": "Note_Pitch_54", + "280": "Note_Pitch_55", + "281": "Note_Pitch_56", + "282": "Note_Pitch_57", + "283": "Note_Pitch_58", + "284": "Note_Pitch_59", + "285": "Note_Pitch_60", + "286": "Note_Pitch_61", + "287": "Note_Pitch_62", + "288": "Note_Pitch_63", + "289": "Note_Pitch_64", + "290": "Note_Pitch_65", + "291": "Note_Pitch_66", + "292": "Note_Pitch_67", + "293": "Note_Pitch_68", + "294": "Note_Pitch_69", + "295": "Note_Pitch_70", + "296": "Note_Pitch_71", + "297": "Note_Pitch_72", + "298": "Note_Pitch_73", + "299": "Note_Pitch_74", + "300": "Note_Pitch_75", + "301": "Note_Pitch_76", + "302": "Note_Pitch_77", + "303": "Note_Pitch_78", + "304": "Note_Pitch_79", + "305": "Note_Pitch_80", + "306": "Note_Pitch_81", + "307": "Note_Pitch_82", + "308": "Note_Pitch_83", + "309": "Note_Pitch_84", + "310": "Note_Pitch_85", + "311": "Note_Pitch_86", + "312": "Note_Pitch_87", + "313": "Note_Pitch_88", + "314": "Note_Pitch_89", + "315": "Note_Pitch_90", + "316": "Note_Pitch_91", + "317": "Note_Pitch_92", + "318": "Note_Pitch_93", + "319": "Note_Pitch_94", + "320": "Note_Pitch_95", + "321": "Note_Pitch_96", + "322": "Note_Pitch_97", + "323": "Note_Pitch_98", + "324": "Note_Pitch_99", + "325": "Note_Pitch_100", + "326": "Note_Pitch_101", + "327": "Note_Pitch_102", + "328": "Note_Pitch_103", + "329": "Note_Pitch_104", + "330": "Note_Pitch_105", + "331": "Note_Pitch_106", + "332": "Note_Pitch_107", + "333": "Note_Pitch_108", + "334": 
"Note_Pitch_109", + "335": "Note_Pitch_110", + "336": "Note_Pitch_111", + "337": "Note_Pitch_112", + "338": "Note_Pitch_113", + "339": "Note_Pitch_114", + "340": "Note_Pitch_115", + "341": "Note_Pitch_116", + "342": "Note_Pitch_117", + "343": "Note_Pitch_118", + "344": "Note_Pitch_119", + "345": "Note_Pitch_120", + "346": "Note_Pitch_121", + "347": "Note_Pitch_122", + "348": "Note_Pitch_123", + "349": "Note_Pitch_124", + "350": "Instrument_0", + "351": "Instrument_4", + "352": "Instrument_6", + "353": "Instrument_7", + "354": "Instrument_8", + "355": "Instrument_9", + "356": "Instrument_11", + "357": "Instrument_12", + "358": "Instrument_13", + "359": "Instrument_14", + "360": "Instrument_15", + "361": "Instrument_16", + "362": "Instrument_19", + "363": "Instrument_21", + "364": "Instrument_22", + "365": "Instrument_23", + "366": "Instrument_24", + "367": "Instrument_25", + "368": "Instrument_26", + "369": "Instrument_32", + "370": "Instrument_33", + "371": "Instrument_36", + "372": "Instrument_38", + "373": "Instrument_40", + "374": "Instrument_41", + "375": "Instrument_42", + "376": "Instrument_43", + "377": "Instrument_46", + "378": "Instrument_47", + "379": "Instrument_49", + "380": "Instrument_50", + "381": "Instrument_52", + "382": "Instrument_55", + "383": "Instrument_56", + "384": "Instrument_57", + "385": "Instrument_58", + "386": "Instrument_60", + "387": "Instrument_61", + "388": "Instrument_62", + "389": "Instrument_64", + "390": "Instrument_65", + "391": "Instrument_66", + "392": "Instrument_67", + "393": "Instrument_68", + "394": "Instrument_69", + "395": "Instrument_70", + "396": "Instrument_71", + "397": "Instrument_72", + "398": "Instrument_73", + "399": "Instrument_74", + "400": "Instrument_75", + "401": "Instrument_79", + "402": "Instrument_80", + "403": "Instrument_88", + "404": "Instrument_105", + "405": "Instrument_108", + "406": "Instrument_109", + "407": "Instrument_111", + "408": "Instrument_114", + "409": "Instrument_117", + "410": "Instrument_118", + "411": "Chord_A_+", + "412": "Chord_A#_+", + "413": "Chord_B_+", + "414": "Chord_C_+", + "415": "Chord_C#_+", + "416": "Chord_D_+", + "417": "Chord_D#_+", + "418": "Chord_E_+", + "419": "Chord_F_+", + "420": "Chord_F#_+", + "421": "Chord_G_+", + "422": "Chord_G#_+", + "423": "Chord_A_/o7", + "424": "Chord_A#_/o7", + "425": "Chord_B_/o7", + "426": "Chord_C_/o7", + "427": "Chord_C#_/o7", + "428": "Chord_D_/o7", + "429": "Chord_D#_/o7", + "430": "Chord_E_/o7", + "431": "Chord_F_/o7", + "432": "Chord_F#_/o7", + "433": "Chord_G_/o7", + "434": "Chord_G#_/o7", + "435": "Chord_A_7", + "436": "Chord_A#_7", + "437": "Chord_B_7", + "438": "Chord_C_7", + "439": "Chord_C#_7", + "440": "Chord_D_7", + "441": "Chord_D#_7", + "442": "Chord_E_7", + "443": "Chord_F_7", + "444": "Chord_F#_7", + "445": "Chord_G_7", + "446": "Chord_G#_7", + "447": "Chord_A_M", + "448": "Chord_A#_M", + "449": "Chord_B_M", + "450": "Chord_C_M", + "451": "Chord_C#_M", + "452": "Chord_D_M", + "453": "Chord_D#_M", + "454": "Chord_E_M", + "455": "Chord_F_M", + "456": "Chord_F#_M", + "457": "Chord_G_M", + "458": "Chord_G#_M", + "459": "Chord_A_M7", + "460": "Chord_A#_M7", + "461": "Chord_B_M7", + "462": "Chord_C_M7", + "463": "Chord_C#_M7", + "464": "Chord_D_M7", + "465": "Chord_D#_M7", + "466": "Chord_E_M7", + "467": "Chord_F_M7", + "468": "Chord_F#_M7", + "469": "Chord_G_M7", + "470": "Chord_G#_M7", + "471": "Chord_A_m", + "472": "Chord_A#_m", + "473": "Chord_B_m", + "474": "Chord_C_m", + "475": "Chord_C#_m", + "476": "Chord_D_m", + "477": "Chord_D#_m", + 
"478": "Chord_E_m", + "479": "Chord_F_m", + "480": "Chord_F#_m", + "481": "Chord_G_m", + "482": "Chord_G#_m", + "483": "Chord_A_m7", + "484": "Chord_A#_m7", + "485": "Chord_B_m7", + "486": "Chord_C_m7", + "487": "Chord_C#_m7", + "488": "Chord_D_m7", + "489": "Chord_D#_m7", + "490": "Chord_E_m7", + "491": "Chord_F_m7", + "492": "Chord_F#_m7", + "493": "Chord_G_m7", + "494": "Chord_G#_m7", + "495": "Chord_A_o", + "496": "Chord_A#_o", + "497": "Chord_B_o", + "498": "Chord_C_o", + "499": "Chord_C#_o", + "500": "Chord_D_o", + "501": "Chord_D#_o", + "502": "Chord_E_o", + "503": "Chord_F_o", + "504": "Chord_F#_o", + "505": "Chord_G_o", + "506": "Chord_G#_o", + "507": "Chord_A_o7", + "508": "Chord_A#_o7", + "509": "Chord_B_o7", + "510": "Chord_C_o7", + "511": "Chord_C#_o7", + "512": "Chord_D_o7", + "513": "Chord_D#_o7", + "514": "Chord_E_o7", + "515": "Chord_F_o7", + "516": "Chord_F#_o7", + "517": "Chord_G_o7", + "518": "Chord_G#_o7", + "519": "Chord_A_sus2", + "520": "Chord_A#_sus2", + "521": "Chord_B_sus2", + "522": "Chord_C_sus2", + "523": "Chord_C#_sus2", + "524": "Chord_D_sus2", + "525": "Chord_D#_sus2", + "526": "Chord_E_sus2", + "527": "Chord_F_sus2", + "528": "Chord_F#_sus2", + "529": "Chord_G_sus2", + "530": "Chord_G#_sus2", + "531": "Chord_A_sus4", + "532": "Chord_A#_sus4", + "533": "Chord_B_sus4", + "534": "Chord_C_sus4", + "535": "Chord_C#_sus4", + "536": "Chord_D_sus4", + "537": "Chord_D#_sus4", + "538": "Chord_E_sus4", + "539": "Chord_F_sus4", + "540": "Chord_F#_sus4", + "541": "Chord_G_sus4", + "542": "Chord_G#_sus4", + "543": "Chord_N_N" +} \ No newline at end of file diff --git a/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json b/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_SymphonyNet_Dataset/vocab_SymphonyNet_Dataset_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": 
"Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": 
"Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + 
"44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": 
"NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": 
"Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": 
"Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": 
"Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json new file mode 100644 index 0000000..c3a2fac --- /dev/null +++ b/vocab/vocab_XMIDI_Dataset/vocab_XMIDI_Dataset_nb8_old.json @@ -0,0 +1,378 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + 
"55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_32", + "2": "Tempo_35", + "3": "Tempo_38", + "4": "Tempo_40", + "5": "Tempo_44", + "6": "Tempo_46", + "7": "Tempo_50", + "8": "Tempo_54", + "9": "Tempo_56", + "10": "Tempo_60", + "11": "Tempo_62", + "12": "Tempo_64", + "13": "Tempo_67", + "14": "Tempo_70", + "15": "Tempo_73", + "16": "Tempo_76", + "17": "Tempo_79", + "18": "Tempo_82", + "19": "Tempo_85", + "20": "Tempo_88", + "21": "Tempo_92", + "22": "Tempo_96", + "23": "Tempo_100", + "24": "Tempo_104", + "25": "Tempo_108", + "26": "Tempo_112", + "27": "Tempo_116", + "28": "Tempo_121", + "29": "Tempo_126", + "30": "Tempo_131", + "31": "Tempo_136", + "32": "Tempo_141", + "33": "Tempo_147", + "34": "Tempo_153", + "35": "Tempo_159", + "36": "Tempo_165", + "37": "Tempo_172", + "38": "Tempo_179", + "39": "Tempo_186", + "40": "Tempo_193", + "41": "Tempo_201", + "42": "Tempo_209", + "43": "Tempo_217", + "44": "Tempo_226" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_12", + "3": "Instrument_16", + "4": "Instrument_25", + "5": "Instrument_32", + "6": "Instrument_40", + "7": "Instrument_46", + "8": "Instrument_48", + "9": "Instrument_56", + "10": "Instrument_58", + "11": "Instrument_66", + "12": "Instrument_73", + "13": "Instrument_80", + "14": "Instrument_88", + "15": "Instrument_104", + "16": "Instrument_107", + "17": "Instrument_114" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": 
"Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json b/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json new file mode 100644 index 0000000..085da32 --- /dev/null +++ b/vocab/vocab_aria-midi/oldvocab_aria-midi_nb8.json @@ -0,0 +1,311 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": 
"SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_4/4" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_121" + }, + "instrument": { + "0": 0, + "1": "Instrument_0" + }, + "pitch": { + "0": 0, + "1": 
"Note_Pitch_11", + "2": "Note_Pitch_12", + "3": "Note_Pitch_13", + "4": "Note_Pitch_14", + "5": "Note_Pitch_15", + "6": "Note_Pitch_16", + "7": "Note_Pitch_17", + "8": "Note_Pitch_18", + "9": "Note_Pitch_19", + "10": "Note_Pitch_20", + "11": "Note_Pitch_21", + "12": "Note_Pitch_22", + "13": "Note_Pitch_23", + "14": "Note_Pitch_24", + "15": "Note_Pitch_25", + "16": "Note_Pitch_26", + "17": "Note_Pitch_27", + "18": "Note_Pitch_28", + "19": "Note_Pitch_29", + "20": "Note_Pitch_30", + "21": "Note_Pitch_31", + "22": "Note_Pitch_32", + "23": "Note_Pitch_33", + "24": "Note_Pitch_34", + "25": "Note_Pitch_35", + "26": "Note_Pitch_36", + "27": "Note_Pitch_37", + "28": "Note_Pitch_38", + "29": "Note_Pitch_39", + "30": "Note_Pitch_40", + "31": "Note_Pitch_41", + "32": "Note_Pitch_42", + "33": "Note_Pitch_43", + "34": "Note_Pitch_44", + "35": "Note_Pitch_45", + "36": "Note_Pitch_46", + "37": "Note_Pitch_47", + "38": "Note_Pitch_48", + "39": "Note_Pitch_49", + "40": "Note_Pitch_50", + "41": "Note_Pitch_51", + "42": "Note_Pitch_52", + "43": "Note_Pitch_53", + "44": "Note_Pitch_54", + "45": "Note_Pitch_55", + "46": "Note_Pitch_56", + "47": "Note_Pitch_57", + "48": "Note_Pitch_58", + "49": "Note_Pitch_59", + "50": "Note_Pitch_60", + "51": "Note_Pitch_61", + "52": "Note_Pitch_62", + "53": "Note_Pitch_63", + "54": "Note_Pitch_64", + "55": "Note_Pitch_65", + "56": "Note_Pitch_66", + "57": "Note_Pitch_67", + "58": "Note_Pitch_68", + "59": "Note_Pitch_69", + "60": "Note_Pitch_70", + "61": "Note_Pitch_71", + "62": "Note_Pitch_72", + "63": "Note_Pitch_73", + "64": "Note_Pitch_74", + "65": "Note_Pitch_75", + "66": "Note_Pitch_76", + "67": "Note_Pitch_77", + "68": "Note_Pitch_78", + "69": "Note_Pitch_79", + "70": "Note_Pitch_80", + "71": "Note_Pitch_81", + "72": "Note_Pitch_82", + "73": "Note_Pitch_83", + "74": "Note_Pitch_84", + "75": "Note_Pitch_85", + "76": "Note_Pitch_86", + "77": "Note_Pitch_87", + "78": "Note_Pitch_88", + "79": "Note_Pitch_89", + "80": "Note_Pitch_90", + "81": "Note_Pitch_91", + "82": "Note_Pitch_92", + "83": "Note_Pitch_93", + "84": "Note_Pitch_94", + "85": "Note_Pitch_95", + "86": "Note_Pitch_96", + "87": "Note_Pitch_97", + "88": "Note_Pitch_98", + "89": "Note_Pitch_99", + "90": "Note_Pitch_100", + "91": "Note_Pitch_101", + "92": "Note_Pitch_102", + "93": "Note_Pitch_103", + "94": "Note_Pitch_104", + "95": "Note_Pitch_105", + "96": "Note_Pitch_106", + "97": "Note_Pitch_107", + "98": "Note_Pitch_108", + "99": "Note_Pitch_109", + "100": "Note_Pitch_110", + "101": "Note_Pitch_111", + "102": "Note_Pitch_112", + "103": "Note_Pitch_113", + "104": "Note_Pitch_114", + "105": "Note_Pitch_115", + "106": "Note_Pitch_116", + "107": "Note_Pitch_117", + "108": "Note_Pitch_118", + "109": "Note_Pitch_119" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json b/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ 
b/vocab/vocab_aria-midi/vocab_aria-midi_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + 
"89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + 
"42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": 
"Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json b/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json new file mode 100644 index 0000000..6b4d0c3 --- /dev/null +++ b/vocab/vocab_gigamidi/oldvocab_gigamidi_nb8.json @@ -0,0 +1,494 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + 
"0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + 
"131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + 
"16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json b/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_gigamidi/vocab_gigamidi_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + 
"0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + 
"93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": 
"Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": "Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": 
"Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json b/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json new file mode 100644 index 0000000..9f49c7e --- /dev/null +++ b/vocab/vocab_new_dataset/vocab_new_dataset_nb8.json @@ -0,0 +1,557 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": 
"Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", + "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": 
"Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_N_N", + "2": "Tempo_30", + "3": "Tempo_31", + "4": "Tempo_32", + "5": "Tempo_33", + "6": "Tempo_34", + "7": "Tempo_35", + "8": "Tempo_36", + "9": "Tempo_37", + "10": "Tempo_38", + "11": "Tempo_40", + "12": "Tempo_42", + "13": "Tempo_44", + "14": "Tempo_46", + "15": "Tempo_48", + "16": "Tempo_50", + "17": "Tempo_52", + "18": "Tempo_54", + "19": "Tempo_56", + "20": "Tempo_58", + "21": "Tempo_60", + "22": "Tempo_62", + "23": "Tempo_64", + "24": "Tempo_67", + "25": "Tempo_70", + "26": "Tempo_73", + "27": "Tempo_76", + "28": "Tempo_79", + "29": "Tempo_82", + "30": "Tempo_85", + "31": "Tempo_88", + "32": "Tempo_92", + "33": "Tempo_96", + "34": "Tempo_100", + "35": "Tempo_104", + "36": "Tempo_108", + "37": "Tempo_112", + "38": "Tempo_116", + "39": "Tempo_121", + "40": "Tempo_126", + "41": "Tempo_131", + "42": "Tempo_136", + "43": "Tempo_141", + "44": "Tempo_147", + "45": "Tempo_153", + "46": "Tempo_159", + "47": "Tempo_165", + "48": "Tempo_172", + "49": "Tempo_179", + "50": "Tempo_186", + "51": "Tempo_193", + "52": "Tempo_201", + "53": "Tempo_209", + "54": "Tempo_217", + "55": "Tempo_226", + "56": "Tempo_235", + "57": "Tempo_244", + "58": "Tempo_254", + "59": "Tempo_264", + "60": "Tempo_275", + "61": "Tempo_286", + "62": "Tempo_297", + "63": "Tempo_309", + "64": "Tempo_321", + "65": "Tempo_334", + "66": "Tempo_347", + "67": "Tempo_361", + "68": "Tempo_375", + "69": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_1", + "3": "Instrument_2", + "4": "Instrument_3", + "5": "Instrument_4", + "6": "Instrument_5", + "7": "Instrument_6", + "8": "Instrument_7", + "9": "Instrument_8", + "10": "Instrument_9", + "11": "Instrument_10", + "12": "Instrument_11", + "13": "Instrument_12", + "14": "Instrument_13", + "15": "Instrument_14", + "16": "Instrument_15", + "17": "Instrument_16", + "18": "Instrument_17", + "19": "Instrument_18", + "20": "Instrument_19", + "21": "Instrument_20", + "22": "Instrument_21", + "23": "Instrument_22", + "24": "Instrument_23", + "25": "Instrument_24", + "26": "Instrument_25", + "27": "Instrument_26", + "28": "Instrument_27", + "29": "Instrument_28", + "30": "Instrument_29", + "31": "Instrument_30", + "32": "Instrument_31", + "33": "Instrument_32", + "34": "Instrument_33", + "35": "Instrument_34", + "36": "Instrument_35", + "37": "Instrument_36", + "38": "Instrument_37", + "39": "Instrument_38", + "40": "Instrument_39", + "41": "Instrument_40", + "42": "Instrument_41", + "43": "Instrument_42", + "44": "Instrument_43", + "45": "Instrument_44", + "46": "Instrument_45", + "47": "Instrument_46", + "48": "Instrument_47", + "49": "Instrument_48", + "50": "Instrument_49", + "51": "Instrument_50", + "52": "Instrument_51", + "53": "Instrument_52", + "54": "Instrument_53", + "55": "Instrument_54", + "56": "Instrument_55", + "57": "Instrument_56", + "58": "Instrument_57", + "59": "Instrument_58", + "60": "Instrument_59", + "61": "Instrument_60", + "62": "Instrument_61", + "63": "Instrument_62", + "64": "Instrument_63", + "65": "Instrument_64", + "66": "Instrument_65", + "67": "Instrument_66", + "68": "Instrument_67", + "69": "Instrument_68", + "70": "Instrument_69", + "71": "Instrument_70", + "72": "Instrument_71", + "73": "Instrument_72", + "74": "Instrument_73", + "75": "Instrument_74", + "76": "Instrument_75", + "77": "Instrument_76", + "78": "Instrument_77", + "79": "Instrument_78", + "80": "Instrument_79", + "81": "Instrument_80", + "82": "Instrument_81", + "83": "Instrument_82", + "84": 
"Instrument_83", + "85": "Instrument_84", + "86": "Instrument_85", + "87": "Instrument_86", + "88": "Instrument_87", + "89": "Instrument_88", + "90": "Instrument_89", + "91": "Instrument_90", + "92": "Instrument_91", + "93": "Instrument_92", + "94": "Instrument_93", + "95": "Instrument_94", + "96": "Instrument_95", + "97": "Instrument_96", + "98": "Instrument_97", + "99": "Instrument_98", + "100": "Instrument_99", + "101": "Instrument_100", + "102": "Instrument_101", + "103": "Instrument_102", + "104": "Instrument_103", + "105": "Instrument_104", + "106": "Instrument_105", + "107": "Instrument_106", + "108": "Instrument_107", + "109": "Instrument_108", + "110": "Instrument_109", + "111": "Instrument_110", + "112": "Instrument_111", + "113": "Instrument_112", + "114": "Instrument_113", + "115": "Instrument_114", + "116": "Instrument_115", + "117": "Instrument_116", + "118": "Instrument_117", + "119": "Instrument_118", + "120": "Instrument_119", + "121": "Instrument_120", + "122": "Instrument_121", + "123": "Instrument_122", + "124": "Instrument_123", + "125": "Instrument_124", + "126": "Instrument_125", + "127": "Instrument_126", + "128": "Instrument_127" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + "58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": 
"Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file diff --git a/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json b/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json new file mode 100644 index 0000000..2a1adbb --- /dev/null +++ b/vocab/vocab_new_dataset/vocab_new_dataset_nb8_old.json @@ -0,0 +1,493 @@ +{ + "type": { + "0": "SOS", + "1": "EOS", + "2": "Empty_Bar", + "3": "SSS", + "4": "SSN", + "5": "SNN", + "6": "NNN_time_signature_12/8", + "7": "NNN_time_signature_11/8", + "8": "NNN_time_signature_9/8", + "9": "NNN_time_signature_8/8", + "10": "NNN_time_signature_7/8", + "11": "NNN_time_signature_6/8", + "12": "NNN_time_signature_5/8", + "13": "NNN_time_signature_4/8", + "14": "NNN_time_signature_3/8", + "15": "NNN_time_signature_2/8", + "16": "NNN_time_signature_1/8", + "17": "NNN_time_signature_8/4", + "18": "NNN_time_signature_7/4", + "19": "NNN_time_signature_6/4", + "20": "NNN_time_signature_5/4", + "21": "NNN_time_signature_4/4", + "22": "NNN_time_signature_3/4", + "23": "NNN_time_signature_2/4", + "24": "NNN_time_signature_1/4", + "25": "NNN_time_signature_4/2", + "26": "NNN_time_signature_3/2", + "27": "NNN_time_signature_2/2", + "28": "NNN_time_signature_1/2", + "29": "NNN_time_signature_1/1" + }, + "beat": { + "0": 0, + "1": "Beat_0", + "2": "Beat_1", + "3": "Beat_2", + "4": "Beat_3", + "5": "Beat_4", + "6": "Beat_5", + "7": "Beat_6", + "8": "Beat_7", + "9": "Beat_8", + "10": "Beat_9", + "11": "Beat_10", + "12": "Beat_11", + "13": "Beat_12", + "14": "Beat_13", + "15": "Beat_14", + "16": "Beat_15", + "17": "Beat_16", + "18": "Beat_17", + "19": "Beat_18", + "20": "Beat_19", + "21": "Beat_20", + "22": "Beat_21", + "23": "Beat_22", + "24": "Beat_23", + "25": "Beat_24", + "26": "Beat_25", + "27": "Beat_26", + "28": "Beat_27", + "29": "Beat_28", + "30": "Beat_29", + "31": "Beat_30", + "32": "Beat_31" + }, + "chord": { + "0": 0, + "1": "Chord_N_N", + "2": "Chord_A_+", + "3": "Chord_A#_+", + "4": "Chord_B_+", + "5": "Chord_C_+", + "6": "Chord_C#_+", + "7": "Chord_D_+", + "8": "Chord_D#_+", + "9": "Chord_E_+", + "10": "Chord_F_+", + "11": "Chord_F#_+", + "12": "Chord_G_+", + "13": "Chord_G#_+", + "14": "Chord_A_/o7", + "15": "Chord_A#_/o7", + "16": "Chord_B_/o7", + "17": "Chord_C_/o7", + "18": "Chord_C#_/o7", + "19": "Chord_D_/o7", 
+ "20": "Chord_D#_/o7", + "21": "Chord_E_/o7", + "22": "Chord_F_/o7", + "23": "Chord_F#_/o7", + "24": "Chord_G_/o7", + "25": "Chord_G#_/o7", + "26": "Chord_A_7", + "27": "Chord_A#_7", + "28": "Chord_B_7", + "29": "Chord_C_7", + "30": "Chord_C#_7", + "31": "Chord_D_7", + "32": "Chord_D#_7", + "33": "Chord_E_7", + "34": "Chord_F_7", + "35": "Chord_F#_7", + "36": "Chord_G_7", + "37": "Chord_G#_7", + "38": "Chord_A_M", + "39": "Chord_A#_M", + "40": "Chord_B_M", + "41": "Chord_C_M", + "42": "Chord_C#_M", + "43": "Chord_D_M", + "44": "Chord_D#_M", + "45": "Chord_E_M", + "46": "Chord_F_M", + "47": "Chord_F#_M", + "48": "Chord_G_M", + "49": "Chord_G#_M", + "50": "Chord_A_M7", + "51": "Chord_A#_M7", + "52": "Chord_B_M7", + "53": "Chord_C_M7", + "54": "Chord_C#_M7", + "55": "Chord_D_M7", + "56": "Chord_D#_M7", + "57": "Chord_E_M7", + "58": "Chord_F_M7", + "59": "Chord_F#_M7", + "60": "Chord_G_M7", + "61": "Chord_G#_M7", + "62": "Chord_A_m", + "63": "Chord_A#_m", + "64": "Chord_B_m", + "65": "Chord_C_m", + "66": "Chord_C#_m", + "67": "Chord_D_m", + "68": "Chord_D#_m", + "69": "Chord_E_m", + "70": "Chord_F_m", + "71": "Chord_F#_m", + "72": "Chord_G_m", + "73": "Chord_G#_m", + "74": "Chord_A_m7", + "75": "Chord_A#_m7", + "76": "Chord_B_m7", + "77": "Chord_C_m7", + "78": "Chord_C#_m7", + "79": "Chord_D_m7", + "80": "Chord_D#_m7", + "81": "Chord_E_m7", + "82": "Chord_F_m7", + "83": "Chord_F#_m7", + "84": "Chord_G_m7", + "85": "Chord_G#_m7", + "86": "Chord_A_o", + "87": "Chord_A#_o", + "88": "Chord_B_o", + "89": "Chord_C_o", + "90": "Chord_C#_o", + "91": "Chord_D_o", + "92": "Chord_D#_o", + "93": "Chord_E_o", + "94": "Chord_F_o", + "95": "Chord_F#_o", + "96": "Chord_G_o", + "97": "Chord_G#_o", + "98": "Chord_A_o7", + "99": "Chord_A#_o7", + "100": "Chord_B_o7", + "101": "Chord_C_o7", + "102": "Chord_C#_o7", + "103": "Chord_D_o7", + "104": "Chord_D#_o7", + "105": "Chord_E_o7", + "106": "Chord_F_o7", + "107": "Chord_F#_o7", + "108": "Chord_G_o7", + "109": "Chord_G#_o7", + "110": "Chord_A_sus2", + "111": "Chord_A#_sus2", + "112": "Chord_B_sus2", + "113": "Chord_C_sus2", + "114": "Chord_C#_sus2", + "115": "Chord_D_sus2", + "116": "Chord_D#_sus2", + "117": "Chord_E_sus2", + "118": "Chord_F_sus2", + "119": "Chord_F#_sus2", + "120": "Chord_G_sus2", + "121": "Chord_G#_sus2", + "122": "Chord_A_sus4", + "123": "Chord_A#_sus4", + "124": "Chord_B_sus4", + "125": "Chord_C_sus4", + "126": "Chord_C#_sus4", + "127": "Chord_D_sus4", + "128": "Chord_D#_sus4", + "129": "Chord_E_sus4", + "130": "Chord_F_sus4", + "131": "Chord_F#_sus4", + "132": "Chord_G_sus4", + "133": "Chord_G#_sus4" + }, + "tempo": { + "0": 0, + "1": "Tempo_30", + "2": "Tempo_31", + "3": "Tempo_32", + "4": "Tempo_33", + "5": "Tempo_34", + "6": "Tempo_35", + "7": "Tempo_36", + "8": "Tempo_37", + "9": "Tempo_38", + "10": "Tempo_40", + "11": "Tempo_42", + "12": "Tempo_44", + "13": "Tempo_46", + "14": "Tempo_48", + "15": "Tempo_50", + "16": "Tempo_52", + "17": "Tempo_54", + "18": "Tempo_56", + "19": "Tempo_58", + "20": "Tempo_60", + "21": "Tempo_62", + "22": "Tempo_64", + "23": "Tempo_67", + "24": "Tempo_70", + "25": "Tempo_73", + "26": "Tempo_76", + "27": "Tempo_79", + "28": "Tempo_82", + "29": "Tempo_85", + "30": "Tempo_88", + "31": "Tempo_92", + "32": "Tempo_96", + "33": "Tempo_100", + "34": "Tempo_104", + "35": "Tempo_108", + "36": "Tempo_112", + "37": "Tempo_116", + "38": "Tempo_121", + "39": "Tempo_126", + "40": "Tempo_131", + "41": "Tempo_136", + "42": "Tempo_141", + "43": "Tempo_147", + "44": "Tempo_153", + "45": "Tempo_159", + "46": "Tempo_165", + 
"47": "Tempo_172", + "48": "Tempo_179", + "49": "Tempo_186", + "50": "Tempo_193", + "51": "Tempo_201", + "52": "Tempo_209", + "53": "Tempo_217", + "54": "Tempo_226", + "55": "Tempo_235", + "56": "Tempo_244", + "57": "Tempo_254", + "58": "Tempo_264", + "59": "Tempo_275", + "60": "Tempo_286", + "61": "Tempo_297", + "62": "Tempo_309", + "63": "Tempo_321", + "64": "Tempo_334", + "65": "Tempo_347", + "66": "Tempo_361", + "67": "Tempo_375", + "68": "Tempo_390" + }, + "instrument": { + "0": 0, + "1": "Instrument_0", + "2": "Instrument_4", + "3": "Instrument_6", + "4": "Instrument_7", + "5": "Instrument_8", + "6": "Instrument_9", + "7": "Instrument_10", + "8": "Instrument_11", + "9": "Instrument_12", + "10": "Instrument_13", + "11": "Instrument_14", + "12": "Instrument_15", + "13": "Instrument_16", + "14": "Instrument_19", + "15": "Instrument_21", + "16": "Instrument_22", + "17": "Instrument_23", + "18": "Instrument_24", + "19": "Instrument_25", + "20": "Instrument_26", + "21": "Instrument_32", + "22": "Instrument_33", + "23": "Instrument_36", + "24": "Instrument_38", + "25": "Instrument_40", + "26": "Instrument_41", + "27": "Instrument_42", + "28": "Instrument_43", + "29": "Instrument_46", + "30": "Instrument_47", + "31": "Instrument_49", + "32": "Instrument_50", + "33": "Instrument_52", + "34": "Instrument_55", + "35": "Instrument_56", + "36": "Instrument_57", + "37": "Instrument_58", + "38": "Instrument_60", + "39": "Instrument_61", + "40": "Instrument_62", + "41": "Instrument_64", + "42": "Instrument_65", + "43": "Instrument_66", + "44": "Instrument_67", + "45": "Instrument_68", + "46": "Instrument_69", + "47": "Instrument_70", + "48": "Instrument_71", + "49": "Instrument_72", + "50": "Instrument_73", + "51": "Instrument_74", + "52": "Instrument_75", + "53": "Instrument_79", + "54": "Instrument_80", + "55": "Instrument_88", + "56": "Instrument_104", + "57": "Instrument_105", + "58": "Instrument_106", + "59": "Instrument_107", + "60": "Instrument_108", + "61": "Instrument_109", + "62": "Instrument_111", + "63": "Instrument_114", + "64": "Instrument_117", + "65": "Instrument_118" + }, + "pitch": { + "0": 0, + "1": "Note_Pitch_6", + "2": "Note_Pitch_7", + "3": "Note_Pitch_8", + "4": "Note_Pitch_9", + "5": "Note_Pitch_10", + "6": "Note_Pitch_11", + "7": "Note_Pitch_12", + "8": "Note_Pitch_13", + "9": "Note_Pitch_14", + "10": "Note_Pitch_15", + "11": "Note_Pitch_16", + "12": "Note_Pitch_17", + "13": "Note_Pitch_18", + "14": "Note_Pitch_19", + "15": "Note_Pitch_20", + "16": "Note_Pitch_21", + "17": "Note_Pitch_22", + "18": "Note_Pitch_23", + "19": "Note_Pitch_24", + "20": "Note_Pitch_25", + "21": "Note_Pitch_26", + "22": "Note_Pitch_27", + "23": "Note_Pitch_28", + "24": "Note_Pitch_29", + "25": "Note_Pitch_30", + "26": "Note_Pitch_31", + "27": "Note_Pitch_32", + "28": "Note_Pitch_33", + "29": "Note_Pitch_34", + "30": "Note_Pitch_35", + "31": "Note_Pitch_36", + "32": "Note_Pitch_37", + "33": "Note_Pitch_38", + "34": "Note_Pitch_39", + "35": "Note_Pitch_40", + "36": "Note_Pitch_41", + "37": "Note_Pitch_42", + "38": "Note_Pitch_43", + "39": "Note_Pitch_44", + "40": "Note_Pitch_45", + "41": "Note_Pitch_46", + "42": "Note_Pitch_47", + "43": "Note_Pitch_48", + "44": "Note_Pitch_49", + "45": "Note_Pitch_50", + "46": "Note_Pitch_51", + "47": "Note_Pitch_52", + "48": "Note_Pitch_53", + "49": "Note_Pitch_54", + "50": "Note_Pitch_55", + "51": "Note_Pitch_56", + "52": "Note_Pitch_57", + "53": "Note_Pitch_58", + "54": "Note_Pitch_59", + "55": "Note_Pitch_60", + "56": "Note_Pitch_61", + "57": "Note_Pitch_62", + 
"58": "Note_Pitch_63", + "59": "Note_Pitch_64", + "60": "Note_Pitch_65", + "61": "Note_Pitch_66", + "62": "Note_Pitch_67", + "63": "Note_Pitch_68", + "64": "Note_Pitch_69", + "65": "Note_Pitch_70", + "66": "Note_Pitch_71", + "67": "Note_Pitch_72", + "68": "Note_Pitch_73", + "69": "Note_Pitch_74", + "70": "Note_Pitch_75", + "71": "Note_Pitch_76", + "72": "Note_Pitch_77", + "73": "Note_Pitch_78", + "74": "Note_Pitch_79", + "75": "Note_Pitch_80", + "76": "Note_Pitch_81", + "77": "Note_Pitch_82", + "78": "Note_Pitch_83", + "79": "Note_Pitch_84", + "80": "Note_Pitch_85", + "81": "Note_Pitch_86", + "82": "Note_Pitch_87", + "83": "Note_Pitch_88", + "84": "Note_Pitch_89", + "85": "Note_Pitch_90", + "86": "Note_Pitch_91", + "87": "Note_Pitch_92", + "88": "Note_Pitch_93", + "89": "Note_Pitch_94", + "90": "Note_Pitch_95", + "91": "Note_Pitch_96", + "92": "Note_Pitch_97", + "93": "Note_Pitch_98", + "94": "Note_Pitch_99", + "95": "Note_Pitch_100", + "96": "Note_Pitch_101", + "97": "Note_Pitch_102", + "98": "Note_Pitch_103", + "99": "Note_Pitch_104", + "100": "Note_Pitch_105", + "101": "Note_Pitch_106", + "102": "Note_Pitch_107", + "103": "Note_Pitch_108", + "104": "Note_Pitch_109", + "105": "Note_Pitch_110", + "106": "Note_Pitch_111", + "107": "Note_Pitch_112", + "108": "Note_Pitch_113", + "109": "Note_Pitch_114", + "110": "Note_Pitch_115", + "111": "Note_Pitch_116", + "112": "Note_Pitch_117", + "113": "Note_Pitch_118", + "114": "Note_Pitch_119", + "115": "Note_Pitch_120", + "116": "Note_Pitch_121", + "117": "Note_Pitch_122", + "118": "Note_Pitch_123", + "119": "Note_Pitch_124", + "120": "Note_Pitch_125", + "121": "Note_Pitch_126" + }, + "duration": { + "0": 0, + "1": "Note_Duration_1", + "2": "Note_Duration_2", + "3": "Note_Duration_3", + "4": "Note_Duration_4", + "5": "Note_Duration_5", + "6": "Note_Duration_6", + "7": "Note_Duration_8", + "8": "Note_Duration_10", + "9": "Note_Duration_12", + "10": "Note_Duration_16", + "11": "Note_Duration_20", + "12": "Note_Duration_24", + "13": "Note_Duration_28", + "14": "Note_Duration_32" + }, + "velocity": { + "0": 0, + "1": "Note_Velocity_40", + "2": "Note_Velocity_60", + "3": "Note_Velocity_80", + "4": "Note_Velocity_100", + "5": "Note_Velocity_120" + } +} \ No newline at end of file