1127 update to latest
This commit is contained in:
7
dllm/scripts/accelerate_configs/cpu.yaml
Normal file
7
dllm/scripts/accelerate_configs/cpu.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
# accelerate config: single-process CPU run (no distributed backend,
# no mixed precision). Useful for debugging pipelines locally.
compute_environment: LOCAL_MACHINE
distributed_type: NO
mixed_precision: "no"
num_processes: 1
machine_rank: 0
num_machines: 1
downcast_bf16: "no"
|
||||
6
dllm/scripts/accelerate_configs/ddp.yaml
Normal file
6
dllm/scripts/accelerate_configs/ddp.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# accelerate config: plain DDP (MULTI_GPU) on one machine with 8 processes.
# No precision settings here; mixed precision is expected to come from the
# training script / accelerate CLI flags.
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
machine_rank: 0
num_machines: 1
num_processes: 8
|
||||
56
dllm/scripts/accelerate_configs/fsdp.yaml
Normal file
56
dllm/scripts/accelerate_configs/fsdp.yaml
Normal file
@ -0,0 +1,56 @@
|
||||
# Reference template kept for comparison with the active config below.
# compute_environment: LOCAL_MACHINE
# debug: false
# distributed_type: FSDP
# downcast_bf16: 'no'
# enable_cpu_affinity: false
# fsdp_config:
#   fsdp_activation_checkpointing: true # Need fix from: https://github.com/huggingface/transformers/pull/36610
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_backward_prefetch: BACKWARD_PRE
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_forward_prefetch: true
#   fsdp_offload_params: false
#   fsdp_sharding_strategy: FULL_SHARD
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_sync_module_states: true
#   fsdp_use_orig_params: true
# machine_rank: 0
# main_training_function: main
# mixed_precision: bf16
# num_machines: 1
# num_processes: 8
# rdzv_backend: static
# same_network: true
# tpu_env: []
# tpu_use_cluster: false
# tpu_use_sudo: false
# use_cpu: false

# Active accelerate FSDP config: single node, 8 processes, bf16,
# full sharding with full state-dict checkpoints.
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
  fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_backward_prefetch: BACKWARD_POST
  fsdp_cpu_ram_efficient_loading: true
  # NOTE(review): this key was previously declared twice (false, then true);
  # YAML last-key-wins made `true` the effective value, so the duplicate was
  # removed and `true` kept — confirm this is the intended setting.
  fsdp_forward_prefetch: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sync_module_states: true
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
|
||||
19
dllm/scripts/accelerate_configs/zero1.yaml
Normal file
19
dllm/scripts/accelerate_configs/zero1.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
# accelerate config: DeepSpeed ZeRO stage 1, single node, 8 processes.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  zero3_init_flag: false
  zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
# mixed_precision: 'bf16'
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
|
||||
21
dllm/scripts/accelerate_configs/zero2.yaml
Normal file
21
dllm/scripts/accelerate_configs/zero2.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
# accelerate config: DeepSpeed ZeRO stage 2, single node, 8 processes.
# Optimizer/parameter offload is disabled (both devices set to `none`).
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
# mixed_precision: 'bf16'
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
|
||||
22
dllm/scripts/accelerate_configs/zero3.yaml
Normal file
22
dllm/scripts/accelerate_configs/zero3.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
# accelerate config: DeepSpeed ZeRO stage 3, single node, 8 processes.
# zero3_init_flag enables sharded model init; zero3_save_16bit_model
# consolidates a fp16/bf16 copy when saving.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
# mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
|
||||
25
dllm/scripts/accelerate_configs/zero3_moe.yaml
Normal file
25
dllm/scripts/accelerate_configs/zero3_moe.yaml
Normal file
@ -0,0 +1,25 @@
|
||||
# accelerate config: DeepSpeed ZeRO stage 3 for MoE models, single node,
# 8 processes, bf16. The MoE layer class name tells DeepSpeed which modules
# to treat as expert layers.
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  # Swap the class name to match the target MoE architecture.
  deepspeed_moe_layer_cls_names: RND1DecoderLayer # LLaDAMoEDecoderLayer
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero3_save_16bit_model: true
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
|
||||
292
dllm/scripts/eval.slurm.sh
Normal file
292
dllm/scripts/eval.slurm.sh
Normal file
@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env bash
#SBATCH --job-name=model-eval
#SBATCH --partition=mllm_safety
#SBATCH --quotatype=spot
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=8
#SBATCH --time=20:00:00
#SBATCH --output=logs/%x-%j.out
#SBATCH --error=logs/%x-%j.err
#SBATCH --requeue

# ============================================================
# Unified Evaluation Configuration + Execution Script
# ============================================================

# ------------------------------------------------------------
# Per-family task tables. Each value is a '|'-separated tuple of
# evaluation hyper-parameters; the field layout is documented
# above each table and unpacked later with `IFS="|" read -r`.
# ------------------------------------------------------------
declare -A eval_llada_base_configs
declare -A eval_llada_instruct_configs

declare -A eval_dream_base_configs
declare -A eval_dream_instruct_configs

declare -A eval_bert_configs


# ============================================================
# ==================== LLaDA CONFIGS ========================
# ============================================================
# eval_llada_configs["<dataset>"]="num_fewshot|max_new_tokens|steps|block_length|seed|mc_num|cfg"
# ============================================================

# ---------- Base Generation ----------
eval_llada_base_configs["gsm8k"]="8|1024|1024|32|1234|128|0.0"
eval_llada_base_configs["bbh"]="3|1024|1024|32|1234|128|0.0"
eval_llada_base_configs["minerva_math"]="4|1024|1024|32|1234|128|0.0"
eval_llada_base_configs["humaneval"]="0|1024|1024|32|1234|128|0.0"
eval_llada_base_configs["mbpp"]="3|1024|1024|32|1234|128|0.0"

# ---------- Base Likelihood ----------
eval_llada_base_configs["gpqa_main_n_shot"]="5|1024|1024|1024|1234|128|0.5"
eval_llada_base_configs["truthfulqa_mc2"]="0|1024|1024|1024|1234|128|2.0"
eval_llada_base_configs["arc_challenge"]="0|1024|1024|1024|1234|128|0.5"
eval_llada_base_configs["hellaswag"]="0|1024|1024|1024|1234|128|0.5"
eval_llada_base_configs["winogrande"]="5|1024|1024|1024|1234|128|0.0"
eval_llada_base_configs["piqa"]="0|1024|1024|1024|1234|128|0.5"
eval_llada_base_configs["mmlu"]="5|1024|1024|1024|1234|1|0.0"
eval_llada_base_configs["cmmlu"]="5|1024|1024|1024|1234|1|0.0"
eval_llada_base_configs["ceval-valid"]="5|1024|1024|1024|1234|1|0.0"

# ---------- Instruct Generation ----------
eval_llada_instruct_configs["gsm8k_cot"]="8|1024|1024|32|1234|1|0.0"
eval_llada_instruct_configs["bbh"]="3|1024|1024|32|1234|1|0.0"
eval_llada_instruct_configs["minerva_math"]="4|1024|1024|32|1234|1|0.0"
eval_llada_instruct_configs["humaneval_instruct"]="0|1024|1024|32|1234|1|0.0"
eval_llada_instruct_configs["mbpp_llada_instruct"]="3|1024|1024|32|1234|1|0.0"

eval_llada_instruct_configs["mmlu_generative"]="0|3|3|3|1234|1|0.0"
eval_llada_instruct_configs["mmlu_pro"]="0|256|256|256|1234|1|0.0"
eval_llada_instruct_configs["hellaswag_gen"]="0|3|3|3|1234|1|0.0"
# Key fixed: was the garbled "arc_challarc_challenge_chatenge" (a botched
# paste); the task name used elsewhere in this script is arc_challenge_chat.
eval_llada_instruct_configs["arc_challenge_chat"]="0|5|5|5|1234|1|0.0"
eval_llada_instruct_configs["gpqa_n_shot_gen"]="5|32|32|32|1234|1|0.0"

# ============================================================
# ==================== DREAM CONFIGS ========================
# ============================================================
# eval_dream_configs["<dataset>"]="num_fewshot|max_new_tokens|steps|temperature|top_p|seed|mc_num"
# ============================================================

# ---------- Base Generation ----------
eval_dream_base_configs["humaneval_dream"]="0|512|512|0.2|0.95|1234|1"
eval_dream_base_configs["gsm8k_cot"]="8|256|256|0.0|0.95|1234|1"
eval_dream_base_configs["mbpp"]="3|512|512|0.2|0.95|1234|1"
eval_dream_base_configs["minerva_math"]="4|512|512|0.0|0.95|1234|1"
eval_dream_base_configs["bbh"]="3|512|512|0.0|0.95|1234|1"

# ---------- Base Likelihood ----------
eval_dream_base_configs["mmlu"]="5|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["arc_easy"]="0|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["arc_challenge"]="0|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["hellaswag"]="0|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["piqa"]="0|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["gpqa_main_n_shot"]="5|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["winogrande"]="5|512|512|0.0|0.95|1234|128"
eval_dream_base_configs["race"]="0|512|512|0.0|0.95|1234|128"

# ---------- Instruct Generation ----------
eval_dream_instruct_configs["mmlu_generative"]="4|128|128|0.1|0.9|1234|1"
eval_dream_instruct_configs["mmlu_generative_dream"]="4|128|128|0.1|0.9|1234|1"
eval_dream_instruct_configs["mmlu_pro"]="4|128|128|0.1|0.9|1234|1"
eval_dream_instruct_configs["gsm8k_cot"]="0|256|256|0.1|0.9|1234|1"
eval_dream_instruct_configs["minerva_math"]="0|512|512|0.1|0.9|1234|1"
eval_dream_instruct_configs["gpqa_main_n_shot"]="5|128|128|0.0|1.0|1234|1"
eval_dream_instruct_configs["humaneval_instruct"]="0|768|768|0.1|0.9|1234|1"
eval_dream_instruct_configs["mbpp_instruct"]="0|1024|1024|0.1|0.9|1234|1"
eval_dream_instruct_configs["mbpp_instruct_dream"]="0|1024|1024|0.1|0.9|1234|1"
eval_dream_instruct_configs["ifeval"]="0|1280|1280|0.1|0.9|1234|1"

# ============================================================
# ==================== BERT CONFIGS =========================
# ============================================================
# eval_bert_configs["<dataset>"]="num_fewshot|max_new_tokens|steps|block_length|seed|mc_num"
# ============================================================

eval_bert_configs["mmlu"]="5|512|512|32|1234|128"
eval_bert_configs["ceval-valid"]="5|1024|1024|32|1234|128"
eval_bert_configs["cmmlu"]="5|1024|1024|32|1234|128"
eval_bert_configs["hellaswag"]="0|1024|1024|1024|1234|128"
eval_bert_configs["winogrande"]="0|128|128|128|1234|128"

eval_bert_configs["gsm8k_bert"]="8|256|256|32|1234|128"
eval_bert_configs["minerva_math"]="4|256|256|32|1234|128"
eval_bert_configs["humaneval"]="0|256|256|32|1234|128"
eval_bert_configs["bbh"]="3|256|256|32|1234|128"

eval_bert_configs["hellaswag_gen"]="0|128|128|128|1234|1"
eval_bert_configs["mmlu_generative"]="0|128|128|128|1234|1"
eval_bert_configs["mmlu_pro"]="0|256|256|256|1234|1"
eval_bert_configs["arc_challenge_chat"]="0|128|128|128|1234|1"

# ============================================================
# ====================== END CONFIGS ========================
# ============================================================
# (a verbatim duplicate of this banner was removed here)
|
||||
|
||||
|
||||
# ===== Derived variables =====
# Topology is derived from the SLURM allocation so the same script works for
# whatever node/GPU count the scheduler grants.
NUM_NODES=${SLURM_NNODES}
# NOTE(review): SLURM_JOB_GPUS is a comma-separated GPU list; counting its
# entries assumes it is set (true under --gres=gpu:N) — confirm for other
# allocation styles.
GPUS_PER_NODE=$(echo "${SLURM_JOB_GPUS}" | tr ',' '\n' | wc -l)
WORLD_SIZE=$((NUM_NODES * GPUS_PER_NODE))
# Deterministic per-job rendezvous port in [20000, 29999] to avoid collisions
# between concurrent jobs sharing a node.
MASTER_PORT=$((20000 + SLURM_JOB_ID % 10000))
# Expand the compressed SLURM nodelist into individual hostnames; the first
# allocated node acts as the rendezvous master.
NODELIST=($(scontrol show hostnames "${SLURM_JOB_NODELIST}"))
MASTER_ADDR=${NODELIST[0]}
TRAIN_NODES=("${NODELIST[@]}")

# Summary banner for the job log.
echo "============================"
echo "JOB NAME: ${SLURM_JOB_NAME}"
echo "JOB ID: ${SLURM_JOB_ID}"
echo "NUM_NODES: ${NUM_NODES}"
echo "WORLD_SIZE: ${WORLD_SIZE}"
echo "MASTER: ${MASTER_ADDR}:${MASTER_PORT}"
echo "============================"

# ===== Environment =====
export PYTHONBREAKPOINT=0                 # disable stray breakpoint() calls
export NCCL_ASYNC_ERROR_HANDLING=1        # fail fast on NCCL errors
export NCCL_DEBUG=warn
export TORCH_DISTRIBUTED_DEBUG=DETAIL
export PYTHONPATH=.:$PYTHONPATH           # make in-repo packages importable
export HF_ALLOW_CODE_EVAL=1               # required for code-gen tasks (humaneval/mbpp)
export HF_DATASETS_TRUST_REMOTE_CODE=1 # For cmmlu dataset
export MASTER_ADDR MASTER_PORT WORLD_SIZE
|
||||
|
||||
|
||||
# ----- Positional CLI arguments -----
# Usage: sbatch eval.slurm.sh <model_class> [task] <model_name_or_path>
#        [instruct] [batch_size] [use_log] [limit]
MODEL_CLASS=${1,,} # "llada" or "dream"
TASK=${2:-"gsm8k"} # dataset name
MODEL_NAME=${3} # model path or name (required)
INSTRUCT=${4:-"False"} # whether to evaluate instruct model
BATCH_SIZE=${5:-"1"} # control batchsize
USE_LOG=${6:-"False"} # optional: enable logging
LIMIT=${7:-"None"} # optional: limit number of test samples (default None)

# The model argument is mandatory; bail out with usage info otherwise.
if [[ -z "${MODEL_NAME}" ]]; then
  echo "❌ Missing model name/path argument!"
  echo "Usage: sbatch eval_model.sh <model_class> <task> <model_name_or_path> [instruct] [batch_size]"
  exit 1
fi

# Absolute paths are used verbatim; bare names are resolved under a base dir.
# NOTE(review): BASE_MODELS_DIR is never set in this script — presumably
# exported by the submitting environment; verify before relying on it.
if [[ "${MODEL_NAME}" == /* ]]; then
  MODEL_PATH="${MODEL_NAME}"
else
  MODEL_PATH="${BASE_MODELS_DIR}/${MODEL_NAME}"
fi
|
||||
|
||||
# Select the task table and eval entry point for the requested model family,
# then unpack the '|'-separated hyper-parameter tuple into shell variables.
case "${MODEL_CLASS}" in
  llada)
    # Instruct checkpoints use a separate task table from base checkpoints.
    if [[ "${INSTRUCT,,}" == "true" ]]; then
      CONFIG="${eval_llada_instruct_configs[$TASK]}"
      CONFIG_SET="instruct"
    else
      CONFIG="${eval_llada_base_configs[$TASK]}"
      CONFIG_SET="base"
    fi

    # Unknown task: list what is available for this family and bail out.
    if [[ -z "${CONFIG}" ]]; then
      echo "❌ Unknown task '${TASK}' for LLaDA (${CONFIG_SET} mode)."
      echo "Available tasks (base): ${!eval_llada_base_configs[@]}"
      echo "Available tasks (instruct): ${!eval_llada_instruct_configs[@]}"
      exit 1
    fi

    # Field layout: num_fewshot|max_new_tokens|steps|block_length|seed|mc_num|cfg
    IFS="|" read -r NUM_FEWSHOT MAX_NEW_TOKENS STEPS BLOCK_LENGTH SEED MC_NUM CFG <<< "${CONFIG}"

    MODEL_TYPE="llada"
    SCRIPT_PATH="dllm/pipelines/llada/eval.py"
    MODEL_ARGS="pretrained=${MODEL_PATH},is_check_greedy=False,mc_num=${MC_NUM},max_new_tokens=${MAX_NEW_TOKENS},steps=${STEPS},block_length=${BLOCK_LENGTH},cfg=${CFG}"
    ;;

  dream)
    if [[ "${INSTRUCT,,}" == "true" ]]; then
      CONFIG="${eval_dream_instruct_configs[$TASK]}"
      CONFIG_SET="instruct"
    else
      CONFIG="${eval_dream_base_configs[$TASK]}"
      CONFIG_SET="base"
    fi

    if [[ -z "${CONFIG}" ]]; then
      echo "❌ Unknown task '${TASK}' for Dream (${CONFIG_SET} mode)."
      echo "Available tasks (base): ${!eval_dream_base_configs[@]}"
      echo "Available tasks (instruct): ${!eval_dream_instruct_configs[@]}"
      exit 1
    fi

    # Field layout: num_fewshot|max_new_tokens|steps|temperature|top_p|seed|mc_num
    IFS="|" read -r NUM_FEWSHOT MAX_NEW_TOKENS STEPS TEMPERATURE TOP_P SEED MC_NUM <<< "${CONFIG}"

    MODEL_TYPE="dream"
    SCRIPT_PATH="dllm/pipelines/dream/eval.py"
    MODEL_ARGS="pretrained=${MODEL_PATH},mc_num=${MC_NUM},max_new_tokens=${MAX_NEW_TOKENS},steps=${STEPS},temperature=${TEMPERATURE},top_p=${TOP_P},add_bos_token=true,escape_until=true"
    ;;

  bert)
    # BERT has a single task table (no base/instruct split).
    CONFIG="${eval_bert_configs[$TASK]}"
    if [[ -z "${CONFIG}" ]]; then
      echo "❌ Unknown task '${TASK}' for BERT."
      echo "Available tasks: ${!eval_bert_configs[@]}"
      exit 1
    fi

    # Field layout: num_fewshot|max_new_tokens|steps|block_length|seed|mc_num
    IFS="|" read -r NUM_FEWSHOT MAX_NEW_TOKENS STEPS BLOCK_LENGTH SEED MC_NUM <<< "${CONFIG}"

    MODEL_TYPE="bert"
    SCRIPT_PATH="dllm/pipelines/bert/eval.py"
    MODEL_ARGS="pretrained=${MODEL_PATH},is_check_greedy=False,mc_num=${MC_NUM},max_new_tokens=${MAX_NEW_TOKENS},steps=${STEPS},block_length=${BLOCK_LENGTH}"
    ;;

  *)
    echo "❌ Invalid model_class '${MODEL_CLASS}'. Must be 'llada' or 'dream' or 'bert'."
    exit 1
    ;;
esac
|
||||
|
||||
|
||||
# ----- Optional flags assembled from the CLI arguments -----
# Chat template is applied only for instruct-model evaluation.
if [[ "${INSTRUCT}" == "True" ]]; then
  APPLY_CHAT_TEMPLATE_ARG="--apply_chat_template True"
else
  APPLY_CHAT_TEMPLATE_ARG=""
fi

# Sample limit is only forwarded when explicitly requested.
if [[ "${LIMIT}" == "None" ]]; then
  LIMIT_ARG=""
else
  LIMIT_ARG="--limit ${LIMIT}"
fi

# Results always go to a per-job JSON; USE_LOG additionally dumps raw samples.
LOG_ARG="--output_path ./logs/${MODEL_CLASS}_${TASK}_${SLURM_JOB_ID}_samples.json"
if [[ "${USE_LOG}" == "True" ]]; then
  LOG_ARG="--log_samples ${LOG_ARG}"
fi

echo -e "\nLaunching ${MODEL_CLASS} on ${TASK} using ${MODEL_PATH}"
echo "============================"
echo "Few-shot: ${NUM_FEWSHOT}"
echo "Seed: ${SEED}"
echo "Batch size: ${BATCH_SIZE}"
# Fixed: previously echoed ${USE_CHAT_TEMPLATE}, a variable that is never set
# anywhere in this script; INSTRUCT is what drives chat-template application.
echo "Use chat template: ${INSTRUCT}"
echo "============================"

# Build the launch command as a string so it can be reused verbatim for both
# the single-node (eval) and multi-node (srun) paths below.
RUN_CMD="accelerate launch \
  --num_processes ${WORLD_SIZE} \
  --num_machines ${NUM_NODES} \
  --main_process_ip ${MASTER_ADDR} \
  --main_process_port ${MASTER_PORT} \
  --machine_rank ${SLURM_PROCID} \
  ${SCRIPT_PATH} \
  --num_fewshot ${NUM_FEWSHOT} \
  --batch_size ${BATCH_SIZE} \
  --model ${MODEL_TYPE} \
  --model_args \"${MODEL_ARGS}\" \
  --tasks ${TASK} \
  --seed ${SEED} \
  ${LOG_ARG} \
  --confirm_run_unsafe_code \
  ${LIMIT_ARG} \
  ${APPLY_CHAT_TEMPLATE_ARG}"

if [[ "${NUM_NODES}" -eq 1 ]]; then
  echo "Single-node execution"
  eval ${RUN_CMD}
else
  echo "Multi-node execution"
  srun --nodes="${NUM_NODES}" --ntasks="${NUM_NODES}" --nodelist="${SLURM_JOB_NODELIST}" ${RUN_CMD}
fi
|
||||
144
dllm/scripts/tests/test_attention_mask.py
Normal file
144
dllm/scripts/tests/test_attention_mask.py
Normal file
@ -0,0 +1,144 @@
|
||||
"""
|
||||
LLaDA / MoE / Dream / RND attention mask invariance tests (compact version)
|
||||
"""
|
||||
|
||||
import gc
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
import dllm
|
||||
import pytest
|
||||
|
||||
ERROR_THRESHOLD = 1e-3
|
||||
|
||||
|
||||
def _cuda_cleanup():
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
# Reclaim interprocess memory blocks (useful after large model del)
|
||||
try:
|
||||
torch.cuda.ipc_collect()
|
||||
except Exception:
|
||||
# Not all PyTorch builds expose ipc_collect on all platforms
|
||||
pass
|
||||
|
||||
|
||||
def _forward_variants(model):
|
||||
"""
|
||||
Run the 5 padding/mask variants and return tensors sliced to the 'real' tokens [1,2,3,4].
|
||||
Returns dict: {'A','B','C','D','E'} each [1, 4, H]
|
||||
"""
|
||||
device = model.device
|
||||
|
||||
# A: no padding
|
||||
a_ids = torch.tensor([[1, 2, 3, 4]], device=device)
|
||||
a_mask = torch.tensor([[1, 1, 1, 1]], device=device)
|
||||
|
||||
# B: left-pad a 0
|
||||
b_ids = torch.tensor([[0, 1, 2, 3, 4]], device=device)
|
||||
b_mask = torch.tensor([[0, 1, 1, 1, 1]], device=device)
|
||||
|
||||
# C: right-pad a 0
|
||||
c_ids = torch.tensor([[1, 2, 3, 4, 0]], device=device)
|
||||
c_mask = torch.tensor([[1, 1, 1, 1, 0]], device=device)
|
||||
|
||||
# D: same as A but attention_mask=None
|
||||
d_ids = torch.tensor([[1, 2, 3, 4]], device=device)
|
||||
d_mask = None
|
||||
|
||||
# E: same as A but omit attention_mask entirely
|
||||
e_ids = torch.tensor([[1, 2, 3, 4]], device=device)
|
||||
|
||||
with torch.no_grad():
|
||||
out_A = model(input_ids=a_ids, attention_mask=a_mask).logits # [1,4,H]
|
||||
out_B = model(input_ids=b_ids, attention_mask=b_mask).logits[:, 1:] # [1,4,H]
|
||||
out_C = model(input_ids=c_ids, attention_mask=c_mask).logits[:, :-1] # [1,4,H]
|
||||
out_D = model(input_ids=d_ids, attention_mask=d_mask).logits # [1,4,H]
|
||||
out_E = model(input_ids=e_ids).logits # [1,4,H]
|
||||
|
||||
return {"A": out_A, "B": out_B, "C": out_C, "D": out_D, "E": out_E}
|
||||
|
||||
|
||||
def _assert_invariance(outs: dict, tag: str):
    """Assert that variants B..E match variant A within ERROR_THRESHOLD."""
    baseline = outs["A"]
    for variant in ("B", "C", "D", "E"):
        matches = torch.allclose(
            baseline, outs[variant], atol=ERROR_THRESHOLD, rtol=ERROR_THRESHOLD
        )
        assert matches, f"[{tag}] Mismatch A vs {variant}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "repo, attn_impl, human_name",
    [
        ("GSAI-ML/LLaDA-8B-Base", None, "LLaDA Base"),
        ("inclusionAI/LLaDA-MoE-7B-A1B-Base", None, "LLaDA MoE"),
        ("Dream-org/Dream-v0-Base-7B", None, "Dream Base"),
        ("radicalnumerics/RND1-Base-0910", None, "RND Base (native)"),
        ("radicalnumerics/RND1-Base-0910", "sdpa", "RND Base (SDPA)"),
    ],
)
def test_attention_mask_invariance(repo, attn_impl, human_name):
    """
    For each model/backend:
    1) Check padding/mask invariance across A..E on the 'real' tokens.
    2) Print a ✅ message for debug visibility (pytest still enforces assertions).

    NOTE(review): requires the checkpoints to be available (resolved via
    BASE_MODELS_DIR or downloaded from the Hub) — not a unit-scale test.
    """
    # Resolve the repo id against a local base dir when BASE_MODELS_DIR is set.
    model_path = dllm.utils.resolve_with_base_env(repo, "BASE_MODELS_DIR")

    # attn_impl=None uses the checkpoint's default attention implementation;
    # otherwise the requested backend (e.g. "sdpa") is forced via the config.
    if attn_impl is None:
        model = transformers.AutoModel.from_pretrained(
            model_path, dtype=torch.float32, device_map="auto"
        ).eval()
    else:
        config = transformers.AutoConfig.from_pretrained(
            model_path, attn_implementation=attn_impl
        )
        model = transformers.AutoModel.from_pretrained(
            model_path, config=config, dtype=torch.float32, device_map="auto"
        ).eval()

    outs = _forward_variants(model)
    _assert_invariance(outs, human_name)

    print(f"✅ {human_name} attention mask invariance passed within {ERROR_THRESHOLD}.")
    # Free GPU memory before the next parametrized case loads another model.
    del model
    gc.collect()
    _cuda_cleanup()
|
||||
|
||||
|
||||
def test_rnd_native_vs_sdpa_equivalence():
    """
    Verify RND (native attention) and RND (SDPA) produce equivalent logits on the
    same real tokens across A..E variants.

    NOTE(review): loads the RND checkpoint twice (both backends resident at
    once) — needs enough memory for two fp32 copies of the model.
    """
    repo = "radicalnumerics/RND1-Base-0910"
    model_path = dllm.utils.resolve_with_base_env(repo, "BASE_MODELS_DIR")

    # native: checkpoint's default attention implementation
    model_native = transformers.AutoModel.from_pretrained(
        model_path, dtype=torch.float32, device_map="auto"
    ).eval()

    # sdpa: force PyTorch scaled_dot_product_attention via the config
    config_sdpa = transformers.AutoConfig.from_pretrained(
        model_path, attn_implementation="sdpa"
    )
    model_sdpa = transformers.AutoModel.from_pretrained(
        model_path, config=config_sdpa, dtype=torch.float32, device_map="auto"
    ).eval()

    outs_native = _forward_variants(model_native)  # expects helper from your file
    outs_sdpa = _forward_variants(model_sdpa)

    # Cross-backend comparison: every variant must agree within the threshold.
    for k in ("A", "B", "C", "D", "E"):
        assert torch.allclose(
            outs_native[k], outs_sdpa[k], atol=ERROR_THRESHOLD, rtol=ERROR_THRESHOLD
        ), f"[RND cross-backend] native vs SDPA mismatch on {k}"

    print(f"✅ RND native vs SDPA equivalence passed within {ERROR_THRESHOLD}.")
    # Explicitly drop model references
    del model_native
    del model_sdpa
    # Collect Python garbage and release CUDA caches
    gc.collect()
    _cuda_cleanup()
|
||||
0
dllm/scripts/tests/test_dream_generation.py
Normal file
0
dllm/scripts/tests/test_dream_generation.py
Normal file
77
dllm/scripts/train.slurm.sh
Normal file
77
dllm/scripts/train.slurm.sh
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env bash
# SLURM training launcher: forwards everything after the known options to an
# accelerate-launched training script (see argument parsing below).
#SBATCH --job-name=dllm
#SBATCH --nodes=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=24
#SBATCH --ntasks-per-node=1
#SBATCH --partition=mllm_safety
#SBATCH --quotatype=spot
#SBATCH --output=./logs/%x-%j.out
# Spelled out as --error (was the abbreviation --err) to use the documented
# long option and match eval.slurm.sh.
#SBATCH --error=./logs/%x-%j.err
#SBATCH --requeue
#SBATCH --time=3-00:00:00
|
||||
|
||||
# ===== Cluster variables =====
NUM_NODES=${SLURM_NNODES}
# NOTE(review): counts entries in CUDA_VISIBLE_DEVICES; if it is unset/empty,
# `wc -l` still reports 1 — confirm it is populated under this partition.
GPUS_PER_NODE=$(echo "$CUDA_VISIBLE_DEVICES" | tr ',' '\n' | wc -l)
WORLD_SIZE=$((NUM_NODES * GPUS_PER_NODE))
# Expand the compressed SLURM nodelist; the first node is the rendezvous master.
NODELIST=($(scontrol show hostnames "${SLURM_JOB_NODELIST}"))
MASTER_ADDR=${NODELIST[0]}
# Deterministic per-job port in [20000, 29999] to avoid cross-job collisions.
MASTER_PORT=$((20000 + SLURM_JOB_ID % 10000))
TRAIN_NODES=("${NODELIST[@]}")

# Aligned key=value dump of the derived topology for the job log.
echo "===== System Variables ====="
{
  echo "NUM_NODES=$NUM_NODES"
  echo "GPUS_PER_NODE=$GPUS_PER_NODE"
  echo "WORLD_SIZE=$WORLD_SIZE"
  echo "MASTER_ADDR=$MASTER_ADDR"
  echo "MASTER_PORT=$MASTER_PORT"
} | column -t -s=

echo "Nodes allocated:"
for node in "${TRAIN_NODES[@]}"; do
  echo " - $node"
done
echo "============================"

# ===== Environment =====
export NCCL_ASYNC_ERROR_HANDLING=1   # fail fast on NCCL errors
export PYTHONPATH=.                  # make in-repo packages importable
|
||||
|
||||
# ----- Defaults -----
accelerate_config="zero2"
script_path="scripts/examples/llada_sft.py"

# ----- Argument parsing -----
# Known options are consumed until the first unrecognized token; that token
# and everything after it is forwarded verbatim to the training script.
FORWARD_ARGS=()
while (( $# > 0 )); do
  case "$1" in
    --accelerate_config) accelerate_config=$2; shift 2 ;;
    --script_path)       script_path=$2;       shift 2 ;;
    *)                   FORWARD_ARGS=("$@");  break ;;
  esac
done
|
||||
|
||||
# Echo the effective options for the job log.
echo "===== Script Variables ====="
echo "--accelerate_config ${accelerate_config}"
echo "--script_path ${script_path}"
echo "--forwarded script args:"
# Pretty-print forwarded args two per line.
# NOTE(review): assumes forwarded args come in flag/value pairs; odd counts or
# values containing spaces will render oddly (display only, launch unaffected).
printf '%s\n' "${FORWARD_ARGS[@]}" | xargs -n 2
echo "============================"

# ===== Launch =====
# One srun task per node; accelerate fans out to all local GPUs using the
# topology derived above and the selected accelerate config file.
srun --nodes="${NUM_NODES}" --ntasks="${NUM_NODES}" --nodelist="${SLURM_JOB_NODELIST}" \
  accelerate launch \
  --config_file "scripts/accelerate_configs/${accelerate_config}.yaml" \
  --num_machines "${NUM_NODES}" \
  --num_processes "${WORLD_SIZE}" \
  --main_process_ip "${MASTER_ADDR}" \
  --main_process_port "${MASTER_PORT}" \
  --machine_rank "${SLURM_PROCID}" \
  --rdzv_backend c10d \
  "${script_path}" "${FORWARD_ARGS[@]}"
|
||||
Reference in New Issue
Block a user