# MIDIFoundationModel/SongEval/eval.py
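"""Score audio with a pretrained SongEval rating model.

Inputs (.wav/.mp3 audio, or pre-computed .npy SSL features) are scored on
five axes: Coherence, Musicality, Memorability, Clarity, and Naturalness.
Per-file scores and their average are written to <output_dir>/result.json.
"""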

import glob
import os
import json
import librosa
import numpy as np
import torch
import argparse
from muq import MuQ
from hydra.utils import instantiate
from omegaconf import OmegaConf
from safetensors.torch import load_file
from tqdm import tqdm
class Synthesizer:
    def __init__(self,
                 checkpoint_path,
                 input_path,
                 output_dir,
                 use_cpu: bool = False):
        self.checkpoint_path = checkpoint_path
        self.input_path = input_path
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        self.device = torch.device('cuda') if (torch.cuda.is_available() and not use_cpu) else torch.device('cpu')
    @torch.no_grad()
    def setup(self):
        # The training config is expected one level above the checkpoint
        # directory (e.g. ckpt/model.safetensors -> ./config.yaml).
        train_config = OmegaConf.load(os.path.join(os.path.dirname(self.checkpoint_path), '../config.yaml'))
        model = instantiate(train_config.generator).to(self.device).eval()
        state_dict = load_file(self.checkpoint_path, device="cpu")
        model.load_state_dict(state_dict, strict=False)
        self.model = model
        # MuQ supplies the self-supervised embeddings the rating model consumes.
        self.muq = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter")
        self.muq = self.muq.to(self.device).eval()
        self.result_dict = {}
    @torch.no_grad()
    def synthesis(self):
        # Collect inputs: a single audio file, a text filelist, or a directory.
        if os.path.isfile(self.input_path):
            if self.input_path.endswith(('.wav', '.mp3')):
                lines = [self.input_path]
            else:
                # Treat any other file as a newline-separated list of paths.
                with open(self.input_path, "r") as f:
                    lines = [line.strip() for line in f if line.strip()]
            input_files = [{"input_path": line} for line in lines]
            print(f"input filelist: {self.input_path}")
        elif os.path.isdir(self.input_path):
            input_files = [{"input_path": file}
                           for file in glob.glob(os.path.join(self.input_path, '*'))
                           if file.lower().endswith(('.wav', '.mp3'))]
        else:
            raise ValueError(f"input_path {self.input_path} is not a file or directory")
        for item in tqdm(input_files):
            try:
                self.handle(**item)
            except Exception as e:
                print(f"failed on {item['input_path']}: {e}")
                continue
        # Append the average of each metric across all scored files.
        if self.result_dict:
            avg_values = {}
            for key in next(iter(self.result_dict.values())).keys():
                avg_values[key] = round(float(np.mean([self.result_dict[fid][key] for fid in self.result_dict])), 4)
            self.result_dict['average'] = avg_values
        # Save per-file and averaged scores.
        with open(os.path.join(self.output_dir, "result.json"), "w") as f:
            json.dump(self.result_dict, f, indent=4, ensure_ascii=False)
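    # Shape of result.json written above (scores below are illustrative):
    #   {
    #     "song_001": {"Coherence": 3.91, "Musicality": 3.80, ...},
    #     "average":  {"Coherence": 3.85, ...}
    #   }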
    @torch.no_grad()
    def handle(self, input_path):
        fid = os.path.splitext(os.path.basename(input_path))[0]
        if input_path.endswith('.npy'):
            # Pre-computed SSL features: validate shape and NaNs before use.
            feats = np.load(input_path)
            if len(feats.shape) == 3 and feats.shape[0] != 1:
                print('ssl_shape error', input_path)
                return
            if np.isnan(feats).any():
                print('ssl nan', input_path)
                return
            feats = torch.from_numpy(feats).to(self.device)
            if len(feats.shape) == 2:
                feats = feats.unsqueeze(0)
        elif input_path.endswith(('.wav', '.mp3')):
            # Raw audio: resample to 24 kHz and take MuQ hidden layer 6
            # as the input features for the rating model.
            wav, sr = librosa.load(input_path, sr=24000)
            audio = torch.tensor(wav).unsqueeze(0).to(self.device)
            output = self.muq(audio, output_hidden_states=True)
            feats = output["hidden_states"][6]
        else:
            print('unsupported input', input_path)
            return
        values = {}
        scores_g = self.model(feats).squeeze(0)
        values['Coherence'] = round(scores_g[0].item(), 4)
        values['Musicality'] = round(scores_g[1].item(), 4)
        values['Memorability'] = round(scores_g[2].item(), 4)
        values['Clarity'] = round(scores_g[3].item(), 4)
        values['Naturalness'] = round(scores_g[4].item(), 4)
        self.result_dict[fid] = values
        # Free the large tensors between files to ease GPU memory pressure.
        del feats, scores_g
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input_path",
        type=str,
        required=True,
        help="Input audio: path to a single file, a text file listing audio paths, or a directory of audio files."
    )
    parser.add_argument(
        "-o", "--output_dir",
        type=str,
        required=True,
        help="Output directory for generated results (will be created if it doesn't exist)."
    )
    parser.add_argument(
        "--use_cpu",
        action="store_true",
        help="Force CPU mode even if a GPU is available."
    )
    args = parser.parse_args()
    ckpt_path = "ckpt/model.safetensors"
    synthesizer = Synthesizer(checkpoint_path=ckpt_path,
                              input_path=args.input_path,
                              output_dir=args.output_dir,
                              use_cpu=args.use_cpu)
    synthesizer.setup()
    synthesizer.synthesis()
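# Example invocations (paths are illustrative):
#   python eval.py -i song.wav -o results/
#   python eval.py -i filelist.txt -o results/
#   python eval.py -i songs/ -o results/ --use_cpu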