1127 update to latest

2025-11-27 15:44:17 +08:00
parent e16c84aab2
commit a34d39430e
153 changed files with 25705 additions and 53 deletions
--- a/dllm/scripts/accelerate_configs/cpu.yaml
+++ b/dllm/scripts/accelerate_configs/cpu.yaml
@ -0,0 +1,10 @
+# Accelerate config for a single, non-distributed process on the local machine.
+# NOTE(review): `use_cpu` is not set in this file despite its name; confirm CPU is forced elsewhere.
+compute_environment: LOCAL_MACHINE
+# Quoted on purpose: YAML 1.1 loaders such as PyYAML resolve a bare NO to the boolean false.
+distributed_type: 'NO'
+mixed_precision: 'no'
+num_processes: 1
+machine_rank: 0
+num_machines: 1
+downcast_bf16: 'no'
--- a/dllm/scripts/accelerate_configs/ddp.yaml
+++ b/dllm/scripts/accelerate_configs/ddp.yaml
@ -0,0 +1,10 @
+# Accelerate config: MULTI_GPU launch with 8 processes on a single local machine.
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+
+# Process topology: one machine (this one is rank 0) running 8 processes.
+num_machines: 1
+machine_rank: 0
+num_processes: 8
+
+downcast_bf16: 'no'
--- a/dllm/scripts/accelerate_configs/fsdp.yaml
+++ b/dllm/scripts/accelerate_configs/fsdp.yaml
@ -0,0 +1,64 @
+# Accelerate config: FSDP (FULL_SHARD) launch with 8 processes on one machine, bf16.
+#
+# Earlier variant, kept commented out for reference. It differs from the active
+# config below in fsdp_activation_checkpointing (true) and
+# fsdp_backward_prefetch (BACKWARD_PRE).
+#
+# compute_environment: LOCAL_MACHINE
+# debug: false
+# distributed_type: FSDP
+# downcast_bf16: 'no'
+# enable_cpu_affinity: false
+# fsdp_config:
+#   fsdp_activation_checkpointing: true # Need fix from: https://github.com/huggingface/transformers/pull/36610
+#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+#   fsdp_backward_prefetch: BACKWARD_PRE
+#   fsdp_cpu_ram_efficient_loading: true
+#   fsdp_forward_prefetch: true
+#   fsdp_offload_params: false
+#   fsdp_sharding_strategy: FULL_SHARD
+#   fsdp_state_dict_type: FULL_STATE_DICT
+#   fsdp_sync_module_states: true
+#   fsdp_use_orig_params: true
+# machine_rank: 0
+# main_training_function: main
+# mixed_precision: bf16
+# num_machines: 1
+# num_processes: 8
+# rdzv_backend: static
+# same_network: true
+# tpu_env: []
+# tpu_use_cluster: false
+# tpu_use_sudo: false
+# use_cpu: false
+
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+fsdp_config:
+  fsdp_activation_checkpointing: false  # Need fix from: https://github.com/huggingface/transformers/pull/36610
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch: BACKWARD_POST
+  fsdp_cpu_ram_efficient_loading: true
+  # This key used to be declared twice in this mapping (false, then true).
+  # Last-wins loaders such as PyYAML kept `true`, so that value is preserved.
+  # NOTE(review): confirm `true` is the intended setting alongside BACKWARD_POST.
+  fsdp_forward_prefetch: true
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/dllm/scripts/accelerate_configs/zero1.yaml
+++ b/dllm/scripts/accelerate_configs/zero1.yaml
@ -0,0 +1,27 @
+# Accelerate config: DeepSpeed ZeRO stage 1 with 8 processes on one machine.
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+  zero_stage: 1
+  zero3_init_flag: false
+  deepspeed_multinode_launcher: standard
+
+# mixed_precision is left unset here; the disabled entry is kept for reference.
+# mixed_precision: 'bf16'
+downcast_bf16: 'no'
+
+# Process topology: one machine (rank 0), 8 processes, static rendezvous.
+num_machines: 1
+machine_rank: 0
+num_processes: 8
+rdzv_backend: static
+same_network: true
+
+main_training_function: main
+debug: false
+
+# TPU settings are empty/disabled and use_cpu is off.
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/dllm/scripts/accelerate_configs/zero2.yaml
+++ b/dllm/scripts/accelerate_configs/zero2.yaml
@ -0,0 +1,29 @
+# Accelerate config: DeepSpeed ZeRO stage 2 with 8 processes on one machine.
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+  zero_stage: 2
+  zero3_init_flag: false
+  offload_optimizer_device: none
+  offload_param_device: none
+  deepspeed_multinode_launcher: standard
+
+# mixed_precision is left unset here; the disabled entry is kept for reference.
+# mixed_precision: 'bf16'
+downcast_bf16: 'no'
+
+# Process topology: one machine (rank 0), 8 processes, static rendezvous.
+num_machines: 1
+machine_rank: 0
+num_processes: 8
+rdzv_backend: static
+same_network: true
+
+main_training_function: main
+debug: false
+
+# TPU settings are empty/disabled and use_cpu is off.
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/dllm/scripts/accelerate_configs/zero3.yaml
+++ b/dllm/scripts/accelerate_configs/zero3.yaml
@ -0,0 +1,30 @
+# Accelerate config: DeepSpeed ZeRO stage 3 with 8 processes on one machine.
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+  zero_stage: 3
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  offload_optimizer_device: none
+  offload_param_device: none
+  deepspeed_multinode_launcher: standard
+
+# mixed_precision is left unset here; the disabled entry is kept for reference.
+# mixed_precision: bf16
+downcast_bf16: 'no'
+
+# Process topology: one machine (rank 0), 8 processes, static rendezvous.
+num_machines: 1
+machine_rank: 0
+num_processes: 8
+rdzv_backend: static
+same_network: true
+
+main_training_function: main
+debug: false
+
+# TPU settings are empty/disabled and use_cpu is off.
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/dllm/scripts/accelerate_configs/zero3_moe.yaml
+++ b/dllm/scripts/accelerate_configs/zero3_moe.yaml
@ -0,0 +1,33 @
+# Accelerate config: DeepSpeed ZeRO stage 3 for an MoE model, 8 processes on one machine, bf16.
+compute_environment: LOCAL_MACHINE
+distributed_type: DEEPSPEED
+deepspeed_config:
+  zero_stage: 3
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  offload_optimizer_device: none
+  offload_param_device: none
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  # MoE layer class name; the author's inline note listed LLaDAMoEDecoderLayer as an alternative.
+  deepspeed_moe_layer_cls_names: RND1DecoderLayer
+
+mixed_precision: bf16
+downcast_bf16: 'no'
+
+# Process topology: one machine (rank 0), 8 processes, static rendezvous.
+num_machines: 1
+machine_rank: 0
+num_processes: 8
+rdzv_backend: static
+same_network: true
+
+main_training_function: main
+debug: false
+enable_cpu_affinity: false
+
+# TPU settings are empty/disabled and use_cpu is off.
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false