first commit

This commit is contained in:
2025-09-08 14:49:28 +08:00
commit 80333dff74
160 changed files with 30655 additions and 0 deletions

View File

@ -0,0 +1,127 @@
import argparse
import time
from pathlib import Path
import numpy as np
import pickle
from tqdm import tqdm
import vocab_utils
class Event2tuneidx():
def __init__(
self,
dataset: str,
encoding_scheme: str,
num_features: int,
in_dir: Path,
out_dir: Path,
debug: bool
):
self.dataset = dataset
self.encoding_scheme = encoding_scheme
self.encoding_name = encoding_scheme + str(num_features)
self.in_dir = in_dir / f"events_{self.dataset}" / self.encoding_name
self.out_dir = out_dir / f"tuneidx_{self.dataset}" / self.encoding_name
self.debug = debug
vocab_name = {'remi':'LangTokenVocab', 'cp':'MusicTokenVocabCP', 'nb':'MusicTokenVocabNB'}
selected_vocab_name = vocab_name[encoding_scheme]
in_vocab_file_path = Path(f"../vocab/vocab_{dataset}/vocab_{dataset}_{encoding_scheme}{num_features}.json")
self.vocab = getattr(vocab_utils, selected_vocab_name)(in_vocab_file_path=in_vocab_file_path, event_data=None,
encoding_scheme=encoding_scheme, num_features=num_features)
def _convert_event_to_tune_in_idx(self, tune_in_event):
tune_in_idx = []
for event in tune_in_event:
event_in_idx = self.vocab(event)
if event_in_idx != None:
tune_in_idx.append(event_in_idx)
return tune_in_idx
def _load_single_event_and_make_tune_in_idx(self, file_path):
with open(file_path, 'rb') as f:
tune_in_event = pickle.load(f)
tune_in_idx = self._convert_event_to_tune_in_idx(tune_in_event)
return file_path.name, tune_in_idx
def make_tune_in_idx(self):
print("preprocessing events data to tune_in_idx data")
# check output directory exists
self.out_dir.mkdir(parents=True, exist_ok=True)
start_time = time.time()
event_list = sorted(list(self.in_dir.rglob("*.pkl")))
if event_list == []:
event_list = sorted(list(self.in_dir.glob("*.pkli")))
for filepath_name, tune_in_idx in tqdm(map(self._load_single_event_and_make_tune_in_idx, event_list), total=len(event_list)):
# save tune_in_idx as npz file with uint16 dtype for remi because it has more than 256 tokens
if self.encoding_scheme == 'remi':
tune_in_idx = np.array(tune_in_idx, dtype=np.int16)
else:
tune_in_idx = np.array(tune_in_idx, dtype=np.int16)
if np.max(tune_in_idx) < 256:
tune_in_idx = np.array(tune_in_idx, dtype=np.uint8)
if filepath_name.endswith('.pkli'):
file_name = filepath_name.replace('.pkli', '.npz')
else:
file_name = filepath_name.replace('.pkl', '.npz')
np.savez_compressed(self.out_dir / file_name, tune_in_idx)
del tune_in_idx
print(f"taken time for making tune_in_idx is {time.time()-start_time}")
def get_argument_parser():
parser = argparse.ArgumentParser()
parser.add_argument(
"-d",
"--dataset",
required=True,
# choices=("BachChorale", "Pop1k7", "Pop909", "SOD", "LakhClean", "SymphonyMIDI"),
type=str,
help="dataset names",
)
parser.add_argument(
"-e",
"--encoding",
required=True,
choices=("remi", "cp", "nb"),
type=str,
help="encoding scheme",
)
parser.add_argument(
"-f",
"--num_features",
required=True,
choices=(4, 5, 7, 8),
type=int,
help="number of features",
)
parser.add_argument(
"-i",
"--in_dir",
default="../dataset/represented_data/events/",
type=Path,
help="input data directory",
)
parser.add_argument(
"-o",
"--out_dir",
default="../dataset/represented_data/tuneidx/",
type=Path,
help="output data directory",
)
parser.add_argument(
"--debug",
action="store_true",
help="enable debug mode",
)
return parser
def main():
parser = get_argument_parser()
args = parser.parse_args()
event2tuneidx = Event2tuneidx(args.dataset, args.encoding, args.num_features, args.in_dir, args.out_dir, args.debug)
event2tuneidx.make_tune_in_idx()
if __name__ == "__main__":
main()