first commit

This commit is contained in:
2025-09-08 14:49:28 +08:00
commit 80333dff74
160 changed files with 30655 additions and 0 deletions

View File

@ -0,0 +1,65 @@
defaults:
# - nn_params: nb8_embSum_NMT
# - nn_params: remi8
- nn_params: nb8_embSum_diff_t2m_150M_finetunning
# - nn_params: nb8_embSum_diff_t2m_150M_pretraining
# - nn_params: nb8_embSum_subPararell
# - nn_params: nb8_embSum_diff_t2m_150M
# - nn_params: nb8_embSum_subFeedForward
# - nn_params: nb8_embSum_diff
# - nn_params: nb8_SA_diff
# - nn_params: nb8_embSum_diff_main12head16dim512_ave
# - nn_params: nb8_embSum_NMT_main12_head_16_dim512
# - nn_params: remi8_main12_head_16_dim512
# - nn_params: nb5_embSum_diff_main12head16dim768_sub3
dataset: FinetuneDataset # Pop1k7, Pop909, SOD, LakhClean, PretrainingDataset, FinetuneDataset
captions_path: dataset/midicaps/train_set.json
# dataset: SymphonyNet_Dataset # Pop1k7, Pop909, SOD, LakhClean
# captions_path: dataset/symphonyNet/syd-caption.json
use_ddp: True # True, False | distributed data parallel
use_fp16: True # True, False | mixed precision training
use_diff: True # True, False | use diffusion in the sub-decoder
diff_steps: 8 # number of diffusion steps
use_dispLoss: True
lambda_weight: 0.5
tau: 0.5
train_params:
device: cuda
batch_size: 3
grad_clip: 1.0
num_iter: 300000 # total number of iterations
num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference
num_cycles_for_model_checkpoint: 1 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint
iterations_per_training_cycle: 10 # number of iterations for logging training loss
iterations_per_validation_cycle: 5000 # number of iterations for validation process
input_length: 3072 # input sequence length
# you can use focal loss; if it's not used, set focal_gamma to 0
focal_alpha: 1
focal_gamma: 0
# learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details
scheduler: cosinelr
initial_lr: 0.00005
decay_step_rate: 0.8 # means it will reach its lowest point at decay_step_rate * total_num_iter
num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts'
warmup_steps: 2000 # number of warmup steps
max_lr: 0.00015
gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts'
# Distributed Data Parallel
world_size: 5 # 0 means no distributed training
gradient_accumulation_steps: 4 # 1 means no gradient accumulation
inference_params:
num_uncond_generation: 1 # number of unconditional generation
num_cond_generation: 3 # number of conditional generation
data_params:
first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument)
split_ratio: 0.998 # train-validation-test split ratio
aug_type: pitch # random, null | pitch and chord augmentation type
general:
debug: False
make_log: True # True, False | update the log file in wandb online to your designated project and entity
infer_and_log: True # True, False | inference and log the results

View File

@ -0,0 +1,54 @@
defaults:
# - nn_params: nb8_embSum_NMT
# - nn_params: remi8
# - nn_params: nb8_embSum_diff
- nn_params: nb8_embSum_subFeedForward
# - nn_params: nb8_SA_diff
# - nn_params: nb8_embSum_diff_main12head16dim512_ave
# - nn_params: nb8_embSum_NMT_main12_head_16_dim512
# - nn_params: remi8_main12_head_16_dim512
# - nn_params: nb5_embSum_diff_main12head16dim768_sub3
dataset: LakhClean # Pop1k7, Pop909, SOD, LakhClean
use_ddp: True # True, False | distributed data parallel
use_fp16: True # True, False | mixed precision training
use_diff: True # True, False | use diffusion in the sub-decoder
use_dispLoss: True
lambda_weight: 0.5
tau: 0.5
diff_steps: 8 # number of diffusion steps
train_params:
device: cuda
batch_size: 8
grad_clip: 1.0
num_iter: 25000 # total number of iterations
num_cycles_for_inference: 10 # number of cycles for inference, iterations_per_validation_cycle * num_cycles_for_inference
num_cycles_for_model_checkpoint: 10 # number of cycles for model checkpoint, iterations_per_validation_cycle * num_cycles_for_model_checkpoint
iterations_per_training_cycle: 10 # number of iterations for logging training loss
iterations_per_validation_cycle: 500 # number of iterations for validation process
input_length: 3072 # input sequence length
# you can use focal loss; if it's not used, set focal_gamma to 0
focal_alpha: 1
focal_gamma: 0
# learning rate scheduler: 'cosinelr', 'cosineannealingwarmuprestarts', 'not-using', please check train_utils.py for more details
scheduler: cosinelr
initial_lr: 0.0001
decay_step_rate: 0.4 # means it will reach its lowest point at decay_step_rate * total_num_iter
num_steps_per_cycle: 20000 # number of steps per cycle for 'cosineannealingwarmuprestarts'
warmup_steps: 2000 # number of warmup steps
max_lr: 0.00015
gamma: 0.6 # the decay rate for 'cosineannealingwarmuprestarts'
# Distributed Data Parallel
world_size: 5 # 0 means no distributed training
gradient_accumulation_steps: 1 # 1 means no gradient accumulation
inference_params:
num_uncond_generation: 1 # number of unconditional generation
num_cond_generation: 3 # number of conditional generation
data_params:
first_pred_feature: pitch # compound shifting for NB only, choose the target sub-token (remi and cp are not influenced by this argument)
split_ratio: 0.99 # train-validation-test split ratio
aug_type: null # random, null | pitch and chord augmentation type
general:
debug: False
make_log: True # True, False | update the log file in wandb online to your designated project and entity
infer_and_log: True # True, False | inference and log the results

View File

@ -0,0 +1,20 @@
encoding_scheme: cp
num_features: 5
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
input_length: 1024
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,20 @@
encoding_scheme: cp
num_features: 5
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
input_length: 1024
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,18 @@
encoding_scheme: cp
num_features: 5
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,19 @@
encoding_scheme: cp
num_features: 5
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
partial_sequential_prediction: True
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,19 @@
encoding_scheme: cp
num_features: 7
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: cp
num_features: 7
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,18 @@
encoding_scheme: cp
num_features: 7
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,20 @@
encoding_scheme: cp
num_features: 7
vocab_name: MusicTokenVocabCP
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
partial_sequential_prediction: True
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
input_length: 1024
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 3
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 3
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: Parallel
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: RNN
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 5
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: SelfAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: Parallel
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: RNN
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 7
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: SelfAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SelfAttentionEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 3
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: CrossAttention
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 6
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.2
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 16
num_head: 12
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,20 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,20 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: AverageEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 3
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 2
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 6
feature_enricher_use: True

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerCrossAttendDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.2
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 16
num_head: 12
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerFinetuningDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.2
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 20
num_head: 12
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerPrefixDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 16
num_head: 12
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerPretrainingDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 768
num_layer: 20
num_head: 12
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,19 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerCrossAttendDecoder
sub_decoder_name: DiffusionDecoder
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1
feature_enricher_use: False

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: FeedForward
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,18 @@
encoding_scheme: nb
num_features: 8
vocab_name: MusicTokenVocabNB
model_name: NestedMusicTransformer
input_embedder_name: SummationEmbedder
main_decoder_name: XtransformerDecoder
sub_decoder_name: Parallel
model_dropout: 0.1
input_embedder:
num_layer: 1
num_head: 8
main_decoder:
dim_model: 512
num_layer: 6
num_head: 8
sub_decoder:
decout_window_size: 1 # 1 means no previous decoding output added
num_layer: 1

View File

@ -0,0 +1,12 @@
encoding_scheme: remi
num_features: 5
vocab_name: LangTokenVocab
model_name: NestedMusicTransformer
input_embedder_name: SingleEmbedding
main_decoder_name: XtransformerDecoder
sub_decoder_name: SingleProjection
model_dropout: 0.1
main_decoder:
dim_model: 512
num_layer: 8
num_head: 8

View File

@ -0,0 +1,12 @@
encoding_scheme: remi
num_features: 7
vocab_name: LangTokenVocab
model_name: NestedMusicTransformer
input_embedder_name: SingleEmbedding
main_decoder_name: XtransformerDecoder
sub_decoder_name: SingleProjection
model_dropout: 0.1
main_decoder:
dim_model: 512
num_layer: 8
num_head: 8

View File

@ -0,0 +1,12 @@
encoding_scheme: remi
num_features: 8
vocab_name: LangTokenVocab
model_name: NestedMusicTransformer
input_embedder_name: SingleEmbedding
main_decoder_name: XtransformerDecoder
sub_decoder_name: SingleProjection
model_dropout: 0.1
main_decoder:
dim_model: 512
num_layer: 8
num_head: 8

View File

@ -0,0 +1,12 @@
encoding_scheme: remi
num_features: 8
vocab_name: LangTokenVocab
model_name: NestedMusicTransformer
input_embedder_name: SingleEmbedding
main_decoder_name: XtransformerDecoder
sub_decoder_name: SingleProjection
model_dropout: 0.1
main_decoder:
dim_model: 512
num_layer: 12
num_head: 16

View File

@ -0,0 +1,17 @@
program: train.py
method: grid
metric:
name: valid.total
goal: minimize
parameters:
train_params.batch_size:
values: [8]
train_params.focal_gamma:
values: [0, 1]
nn_params.main_decoder.input_length:
values: [8192]
command:
- python3
- ${program}
- ${args_no_hyphens}