This is my .sh file:
set -x
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$GEMINI_CODE/material_ds/train.parquet \
data.val_files=$GEMINI_CODE/material_ds/val.parquet \
data.train_batch_size=8 \
data.max_prompt_length=512 \
data.max_response_length=4096 \
data.filter_overlong_prompts=True \
data.truncation='error' \
custom_reward_function.path=$GEMINI_CODE/verl/verl/utils/reward_score/multi_options.py \
actor_rollout_ref.model.path=$GEMINI_DATA_IN1 \
actor_rollout_ref.actor.optim.lr=3e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
actor_rollout_ref.actor.clip_ratio_low=0.2 \
actor_rollout_ref.actor.clip_ratio_high=0.28 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=4 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
algorithm.kl_ctrl.kl_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl_grpo_material' \
trainer.experiment_name='qwen2.5_7b_test_iroha' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=1000 \
trainer.test_freq=50 \
    trainer.total_epochs=1 "$@"
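For reference, custom_reward_function.path points at my multi_options.py, which exposes a compute_score function roughly like the simplified sketch below (the real scoring logic differs; the (data_source, solution_str, ground_truth, extra_info) signature follows the usual verl custom-reward convention):

# Simplified sketch of the custom reward file loaded via custom_reward_function.path.
# The regex and scoring rule here are illustrative only.
import re

def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """Toy multiple-choice scorer: reward 1.0 when the last option letter (A-D)
    found in the model response matches the ground-truth answer, else 0.0."""
    letters = re.findall(r"\b([A-D])\b", solution_str)
    predicted = letters[-1] if letters else None
    return 1.0 if predicted == str(ground_truth).strip() else 0.0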
And this is my console output:
(TaskRunner pid=92717) ANTLR runtime and generated code versions disagree: 4.7.2!=4.9.3
(TaskRunner pid=92717) ANTLR runtime and generated code versions disagree: 4.7.2!=4.9.3
(TaskRunner pid=92717) {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model',
(TaskRunner pid=92717) 'optimizer',
(TaskRunner pid=92717) 'extra']},
(TaskRunner pid=92717) 'clip_ratio': 0.2,
(TaskRunner pid=92717) 'clip_ratio_c': 3.0,
(TaskRunner pid=92717) 'clip_ratio_high': 0.28,
(TaskRunner pid=92717) 'clip_ratio_low': 0.2,
(TaskRunner pid=92717) 'entropy_coeff': 0,
(TaskRunner pid=92717) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=92717) 'offload_policy': False,
(TaskRunner pid=92717) 'optimizer_offload': False,
(TaskRunner pid=92717) 'param_offload': False,
(TaskRunner pid=92717) 'reshard_after_forward': True,
(TaskRunner pid=92717) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=92717) 'grad_clip': 1.0,
(TaskRunner pid=92717) 'kl_loss_coef': 0.001,
(TaskRunner pid=92717) 'kl_loss_type': 'low_var_kl',
(TaskRunner pid=92717) 'loss_agg_mode': 'token-mean',
(TaskRunner pid=92717) 'optim': {'lr': 3e-06,
(TaskRunner pid=92717) 'lr_warmup_steps': 10,
(TaskRunner pid=92717) 'lr_warmup_steps_ratio': 0.0,
(TaskRunner pid=92717) 'min_lr_ratio': None,
(TaskRunner pid=92717) 'total_training_steps': -1,
(TaskRunner pid=92717) 'warmup_style': 'constant',
(TaskRunner pid=92717) 'weight_decay': 0.01},
(TaskRunner pid=92717) 'ppo_epochs': 1,
(TaskRunner pid=92717) 'ppo_max_token_len_per_gpu': 16384,
(TaskRunner pid=92717) 'ppo_micro_batch_size': None,
(TaskRunner pid=92717) 'ppo_micro_batch_size_per_gpu': 2,
(TaskRunner pid=92717) 'ppo_mini_batch_size': 4,
(TaskRunner pid=92717) 'shuffle': False,
(TaskRunner pid=92717) 'strategy': 'fsdp',
(TaskRunner pid=92717) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=92717) 'use_dynamic_bsz': False,
(TaskRunner pid=92717) 'use_kl_loss': True,
(TaskRunner pid=92717) 'use_torch_compile': True},
(TaskRunner pid=92717) 'hybrid_engine': True,
(TaskRunner pid=92717) 'model': {'enable_gradient_checkpointing': True,
(TaskRunner pid=92717) 'external_lib': None,
(TaskRunner pid=92717) 'override_config': {},
(TaskRunner pid=92717) 'path': '/gemini/data-1',
(TaskRunner pid=92717) 'use_liger': False,
(TaskRunner pid=92717) 'use_remove_padding': True},
(TaskRunner pid=92717) 'ref': {'fsdp_config': {'param_offload': True,
(TaskRunner pid=92717) 'reshard_after_forward': True,
(TaskRunner pid=92717) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=92717) 'log_prob_max_token_len_per_gpu': 16384,
(TaskRunner pid=92717) 'log_prob_micro_batch_size': None,
(TaskRunner pid=92717) 'log_prob_micro_batch_size_per_gpu': 4,
(TaskRunner pid=92717) 'log_prob_use_dynamic_bsz': False,
(TaskRunner pid=92717) 'strategy': 'fsdp',
(TaskRunner pid=92717) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=92717) 'use_torch_compile': True},
(TaskRunner pid=92717) 'rollout': {'chat_scheduler': None,
(TaskRunner pid=92717) 'disable_log_stats': True,
(TaskRunner pid=92717) 'do_sample': True,
(TaskRunner pid=92717) 'dtype': 'bfloat16',
(TaskRunner pid=92717) 'enable_chunked_prefill': True,
(TaskRunner pid=92717) 'enforce_eager': True,
(TaskRunner pid=92717) 'engine_kwargs': {'swap_space': None},
(TaskRunner pid=92717) 'free_cache_engine': True,
(TaskRunner pid=92717) 'gpu_memory_utilization': 0.4,
(TaskRunner pid=92717) 'ignore_eos': False,
(TaskRunner pid=92717) 'load_format': 'dummy_dtensor',
(TaskRunner pid=92717) 'log_prob_max_token_len_per_gpu': 16384,
(TaskRunner pid=92717) 'log_prob_micro_batch_size': None,
(TaskRunner pid=92717) 'log_prob_micro_batch_size_per_gpu': 4,
(TaskRunner pid=92717) 'log_prob_use_dynamic_bsz': False,
(TaskRunner pid=92717) 'max_model_len': None,
(TaskRunner pid=92717) 'max_num_batched_tokens': 8192,
(TaskRunner pid=92717) 'max_num_seqs': 1024,
(TaskRunner pid=92717) 'mode': 'sync',
(TaskRunner pid=92717) 'multi_turn': {'enable': False,
(TaskRunner pid=92717) 'format': 'chatml',
(TaskRunner pid=92717) 'max_turns': None,
(TaskRunner pid=92717) 'tool_config_path': None},
(TaskRunner pid=92717) 'n': 16,
(TaskRunner pid=92717) 'name': 'vllm',
(TaskRunner pid=92717) 'prompt_length': 512,
(TaskRunner pid=92717) 'response_length': 4096,
(TaskRunner pid=92717) 'temperature': 1.0,
(TaskRunner pid=92717) 'tensor_model_parallel_size': 2,
(TaskRunner pid=92717) 'top_k': -1,
(TaskRunner pid=92717) 'top_p': 1,
(TaskRunner pid=92717) 'use_fire_sampling': False,
(TaskRunner pid=92717) 'val_kwargs': {'do_sample': False,
(TaskRunner pid=92717) 'n': 1,
(TaskRunner pid=92717) 'temperature': 0,
(TaskRunner pid=92717) 'top_k': -1,
(TaskRunner pid=92717) 'top_p': 1.0}}},
(TaskRunner pid=92717) 'algorithm': {'adv_estimator': 'grpo',
(TaskRunner pid=92717) 'gamma': 1.0,
(TaskRunner pid=92717) 'kl_ctrl': {'horizon': 10000,
(TaskRunner pid=92717) 'kl_coef': 0.001,
(TaskRunner pid=92717) 'target_kl': 0.1,
(TaskRunner pid=92717) 'type': 'fixed'},
(TaskRunner pid=92717) 'kl_penalty': 'kl',
(TaskRunner pid=92717) 'lam': 1.0,
(TaskRunner pid=92717) 'norm_adv_by_std_in_grpo': True,
(TaskRunner pid=92717) 'use_kl_in_reward': False},
(TaskRunner pid=92717) 'critic': {'checkpoint': {'contents': ['model', 'optimizer', 'extra']},
(TaskRunner pid=92717) 'cliprange_value': 0.5,
(TaskRunner pid=92717) 'forward_max_token_len_per_gpu': 32768,
(TaskRunner pid=92717) 'forward_micro_batch_size': None,
(TaskRunner pid=92717) 'forward_micro_batch_size_per_gpu': None,
(TaskRunner pid=92717) 'grad_clip': 1.0,
(TaskRunner pid=92717) 'model': {'enable_gradient_checkpointing': True,
(TaskRunner pid=92717) 'external_lib': None,
(TaskRunner pid=92717) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=92717) 'offload_policy': False,
(TaskRunner pid=92717) 'optimizer_offload': False,
(TaskRunner pid=92717) 'param_offload': False,
(TaskRunner pid=92717) 'reshard_after_forward': True,
(TaskRunner pid=92717) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=92717) 'override_config': {},
(TaskRunner pid=92717) 'path': '~/models/deepseek-llm-7b-chat',
(TaskRunner pid=92717) 'tokenizer_path': '/gemini/data-1',
(TaskRunner pid=92717) 'use_remove_padding': False},
(TaskRunner pid=92717) 'optim': {'lr': 1e-05,
(TaskRunner pid=92717) 'lr_warmup_steps_ratio': 0.0,
(TaskRunner pid=92717) 'min_lr_ratio': None,
(TaskRunner pid=92717) 'total_training_steps': -1,
(TaskRunner pid=92717) 'warmup_style': 'constant',
(TaskRunner pid=92717) 'weight_decay': 0.01},
(TaskRunner pid=92717) 'ppo_epochs': 1,
(TaskRunner pid=92717) 'ppo_max_token_len_per_gpu': 32768,
(TaskRunner pid=92717) 'ppo_micro_batch_size': None,
(TaskRunner pid=92717) 'ppo_micro_batch_size_per_gpu': None,
(TaskRunner pid=92717) 'ppo_mini_batch_size': 4,
(TaskRunner pid=92717) 'rollout_n': 16,
(TaskRunner pid=92717)             'shuffle': False,
(TaskRunner pid=92717)             'strategy': 'fsdp',
(TaskRunner pid=92717)             'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=92717)             'use_dynamic_bsz': False},
(TaskRunner pid=92717) 'custom_reward_function': {'name': 'compute_score',
(TaskRunner pid=92717) 'path': '/gemini/code/verl/verl/utils/reward_score/multi_options.py'},
(TaskRunner pid=92717) 'data': {'custom_cls': {'name': None, 'path': None},
(TaskRunner pid=92717) 'filter_overlong_prompts': True,
(TaskRunner pid=92717) 'filter_overlong_prompts_workers': 1,
(TaskRunner pid=92717) 'image_key': 'images',
(TaskRunner pid=92717) 'max_prompt_length': 512,
(TaskRunner pid=92717) 'max_response_length': 4096,
(TaskRunner pid=92717) 'prompt_key': 'prompt',
(TaskRunner pid=92717) 'return_raw_chat': False,
(TaskRunner pid=92717) 'return_raw_input_ids': False,
(TaskRunner pid=92717) 'reward_fn_key': 'data_source',
(TaskRunner pid=92717) 'shuffle': True,
(TaskRunner pid=92717) 'tokenizer': None,
(TaskRunner pid=92717) 'train_batch_size': 8,
(TaskRunner pid=92717) 'train_files': '/gemini/code/material_ds/train.parquet',
(TaskRunner pid=92717) 'truncation': 'error',
(TaskRunner pid=92717) 'val_batch_size': None,
(TaskRunner pid=92717) 'val_files': '/gemini/code/material_ds/val.parquet',
(TaskRunner pid=92717) 'video_key': 'videos'},
(TaskRunner pid=92717) 'ray_init': {'num_cpus': None},
(TaskRunner pid=92717) 'reward_model': {'enable': False,
(TaskRunner pid=92717) 'forward_max_token_len_per_gpu': 32768,
(TaskRunner pid=92717) 'launch_reward_fn_async': False,
(TaskRunner pid=92717) 'max_length': None,
(TaskRunner pid=92717) 'micro_batch_size': None,
(TaskRunner pid=92717) 'micro_batch_size_per_gpu': None,
(TaskRunner pid=92717) 'model': {'external_lib': None,
(TaskRunner pid=92717) 'fsdp_config': {'fsdp_size': -1,
(TaskRunner pid=92717) 'param_offload': False,
(TaskRunner pid=92717) 'reshard_after_forward': True,
(TaskRunner pid=92717) 'wrap_policy': {'min_num_params': 0}},
(TaskRunner pid=92717) 'input_tokenizer': '/gemini/data-1',
(TaskRunner pid=92717) 'path': '~/models/FsfairX-LLaMA3-RM-v0.1',
(TaskRunner pid=92717) 'use_remove_padding': False},
(TaskRunner pid=92717) 'reward_manager': 'naive',
(TaskRunner pid=92717) 'strategy': 'fsdp',
(TaskRunner pid=92717) 'ulysses_sequence_parallel_size': 1,
(TaskRunner pid=92717) 'use_dynamic_bsz': False},
(TaskRunner pid=92717) 'trainer': {'balance_batch': True,
(TaskRunner pid=92717) 'critic_warmup': 0,
(TaskRunner pid=92717) 'default_hdfs_dir': None,
(TaskRunner pid=92717) 'default_local_dir': '/gemini/output/checkpoints/verl_grpo_material/qwen2.5_7b_test_iroha',
(TaskRunner pid=92717) 'del_local_ckpt_after_load': False,
(TaskRunner pid=92717) 'experiment_name': 'qwen2.5_7b_test_iroha',
(TaskRunner pid=92717) 'log_val_generations': 0,
(TaskRunner pid=92717) 'logger': ['console'],
(TaskRunner pid=92717) 'max_actor_ckpt_to_keep': None,
(TaskRunner pid=92717) 'max_critic_ckpt_to_keep': None,
(TaskRunner pid=92717) 'n_gpus_per_node': 8,
(TaskRunner pid=92717) 'nnodes': 1,
(TaskRunner pid=92717) 'project_name': 'verl_grpo_material',
(TaskRunner pid=92717) 'ray_wait_register_center_timeout': 300,
(TaskRunner pid=92717) 'resume_from_path': None,
(TaskRunner pid=92717) 'resume_mode': 'auto',
(TaskRunner pid=92717) 'rollout_data_dir': None,
(TaskRunner pid=92717) 'save_freq': 1000,
(TaskRunner pid=92717) 'test_freq': 50,
(TaskRunner pid=92717) 'total_epochs': 1,
(TaskRunner pid=92717) 'total_training_steps': None,
(TaskRunner pid=92717) 'val_before_train': True,
(TaskRunner pid=92717) 'validation_data_dir': None}}
(TaskRunner pid=92717) using customized reward function 'compute_score' from '/gemini/code/verl/verl/utils/reward_score/multi_options.py'
(TaskRunner pid=92717) using customized reward function 'compute_score' from '/gemini/code/verl/verl/utils/reward_score/multi_options.py'
(TaskRunner pid=92717) Using dataset class: RLHFDataset
(TaskRunner pid=92717) dataset len: 9096
Filtering prompts longer than 512 tokens: 0%| | 0/9096 [00:00<?, ? examples/s]
Filtering prompts longer than 512 tokens: 11%|█ | 1000/9096 [00:00<00:05, 1523.93 examples/s]
Filtering prompts longer than 512 tokens: 22%|██▏ | 2000/9096 [00:01<00:04, 1536.78 examples/s]
Filtering prompts longer than 512 tokens: 33%|███▎ | 3000/9096 [00:01<00:03, 1562.08 examples/s]
Filtering prompts longer than 512 tokens: 44%|████▍ | 4000/9096 [00:02<00:03, 1570.49 examples/s]
Filtering prompts longer than 512 tokens: 55%|█████▍ | 5000/9096 [00:03<00:02, 1579.07 examples/s]
Filtering prompts longer than 512 tokens: 66%|██████▌ | 6000/9096 [00:03<00:01, 1588.17 examples/s]
Filtering prompts longer than 512 tokens: 77%|███████▋ | 7000/9096 [00:04<00:01, 1593.59 examples/s]
Filtering prompts longer than 512 tokens: 88%|████████▊ | 8000/9096 [00:05<00:00, 1596.14 examples/s]
Filtering prompts longer than 512 tokens: 100%|██████████| 9096/9096 [00:05<00:00, 1579.48 examples/s]
(TaskRunner pid=92717) filter dataset len: 9093
(TaskRunner pid=92717) Using dataset class: RLHFDataset
(TaskRunner pid=92717) dataset len: 1000
Filtering prompts longer than 512 tokens: 0%| | 0/1000 [00:00<?, ? examples/s]
Filtering prompts longer than 512 tokens: 100%|██████████| 1000/1000 [00:00<00:00, 1578.28 examples/s]
(TaskRunner pid=92717) filter dataset len: 999
(TaskRunner pid=92717) [validate_config] All configuration checks passed successfully!
(TaskRunner pid=92717) DeprecationWarning: `ray.state.available_resources_per_node` is a private attribute and access will be removed in a future Ray version.
(TaskRunner pid=92717) Size of train dataloader: 1136, Size of val dataloader: 1
(TaskRunner pid=92717) Total training steps: 1136
(TaskRunner pid=92717) colocated worker base class <class 'verl.single_controller.base.worker.Worker'>
(TaskRunner pid=92717) WARNING:2025-05-20 13:51:31,780:Waiting for register center actor QYiQ3q_register_center to be ready. Elapsed time: 0 seconds out of 300 seconds.
(WorkerDict pid=97566) You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
(WorkerDict pid=97568) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 38.56it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 42.98it/s]
(WorkerDict pid=97568) [rank6]:[W520 13:51:47.162122574 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 6] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id.
(WorkerDict pid=96953) Model config after override: Qwen2Config {
(WorkerDict pid=96953) "architectures": [
(WorkerDict pid=96953) "Qwen2ForCausalLM"
(WorkerDict pid=96953) ],
(WorkerDict pid=96953) "attention_dropout": 0.0,
(WorkerDict pid=96953) "eos_token_id": 151645,
(WorkerDict pid=96953) "hidden_act": "silu",
(WorkerDict pid=96953) "hidden_size": 3584,
(WorkerDict pid=96953) "initializer_range": 0.02,
(WorkerDict pid=96953) "intermediate_size": 18944,
(WorkerDict pid=96953) "max_position_embeddings": 32768,
(WorkerDict pid=96953) "max_window_layers": 28,
(WorkerDict pid=96953) "model_type": "qwen2",
(WorkerDict pid=96953) "num_attention_heads": 28,
(WorkerDict pid=96953) "num_hidden_layers": 28,
(WorkerDict pid=96953) "num_key_value_heads": 4,
(WorkerDict pid=96953) "pad_token_id": 151643,
(WorkerDict pid=96953) "rms_norm_eps": 1e-06,
(WorkerDict pid=96953) "rope_scaling": null,
(WorkerDict pid=96953) "rope_theta": 1000000.0,
(WorkerDict pid=96953) "sliding_window": 131072,
(WorkerDict pid=96953) "tie_word_embeddings": false,
(WorkerDict pid=96953) "torch_dtype": "bfloat16",
(WorkerDict pid=96953) "transformers_version": "4.51.3",
(WorkerDict pid=96953) "use_cache": true,
(WorkerDict pid=96953) "use_sliding_window": false,
(WorkerDict pid=96953) "vocab_size": 152064
(WorkerDict pid=96953) }
(WorkerDict pid=96953)
(WorkerDict pid=96953) NCCL version 2.21.5+cuda12.4
(WorkerDict pid=96953) Qwen2ForCausalLM contains 7.62B parameters
(WorkerDict pid=96953) wrap_policy: functools.partial(<function _or_policy at 0x7f64ed2df6d0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f64ed2df5b0>, transformer_layer_cls={<class 'transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer'>})])
(WorkerDict pid=96953) Actor use_remove_padding=True
(WorkerDict pid=96953) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention [repeated 7x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(WorkerDict pid=97565) wrap_policy: functools.partial(<function _or_policy at 0x7fd4309436d0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7fd4309435b0>, transformer_layer_cls={<class 'transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer'>})]) [repeated 7x across cluster]
(WorkerDict pid=96953) Model config after override: Qwen2Config {
(WorkerDict pid=96953) "architectures": [
(WorkerDict pid=96953) "Qwen2ForCausalLM"
(WorkerDict pid=96953) ],
(WorkerDict pid=96953) "attention_dropout": 0.0,
(WorkerDict pid=96953) "eos_token_id": 151645,
(WorkerDict pid=96953) "hidden_act": "silu",
(WorkerDict pid=96953) "hidden_size": 3584,
(WorkerDict pid=96953) "initializer_range": 0.02,
(WorkerDict pid=96953) "intermediate_size": 18944,
(WorkerDict pid=96953) "max_position_embeddings": 32768,
(WorkerDict pid=96953) "max_window_layers": 28,
(WorkerDict pid=96953) "model_type": "qwen2",
(WorkerDict pid=96953) "num_attention_heads": 28,
(WorkerDict pid=96953) "num_hidden_layers": 28,
(WorkerDict pid=96953) "num_key_value_heads": 4,
(WorkerDict pid=96953) "pad_token_id": 151643,
(WorkerDict pid=96953) "rms_norm_eps": 1e-06,
(WorkerDict pid=96953) "rope_scaling": null,
(WorkerDict pid=96953) "rope_theta": 1000000.0,
(WorkerDict pid=96953) "sliding_window": 131072,
(WorkerDict pid=96953) "tie_word_embeddings": false,
(WorkerDict pid=96953) "torch_dtype": "bfloat16",
(WorkerDict pid=96953) "transformers_version": "4.51.3",
(WorkerDict pid=96953) "use_cache": true,
(WorkerDict pid=96953) "use_sliding_window": false,
(WorkerDict pid=96953) "vocab_size": 152064
(WorkerDict pid=96953) }
(WorkerDict pid=96953)
(WorkerDict pid=96953) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
(WorkerDict pid=96953) You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. [repeated 7x across cluster]
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] [repeated 8x across cluster]
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 33.96it/s] [repeated 6x across cluster]
(WorkerDict pid=96953) [rank0]:[W520 13:51:48.937597330 ProcessGroupNCCL.cpp:4561] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. Specify device_ids in barrier() to force use of a particular device, or call init_process_group() with a device_id. [repeated 7x across cluster]
Loading checkpoint shards: 25%|██▌ | 1/4 [00:02<00:07, 2.45s/it]
(WorkerDict pid=97569) Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Qwen2ForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)` [repeated 7x across cluster]
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s] [repeated 7x across cluster]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:07<00:02, 2.51s/it] [repeated 16x across cluster]
(WorkerDict pid=97565) Actor use_remove_padding=True [repeated 7x across cluster]
(WorkerDict pid=96953) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00, 2.43s/it]
(WorkerDict pid=97567) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:10<00:03, 3.48s/it] [repeated 7x across cluster]
(WorkerDict pid=96953) Qwen2ForCausalLM contains 7.62B parameters
(WorkerDict pid=96953) wrap_policy: functools.partial(<function _or_policy at 0x7f64ed2df6d0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7f64ed2df5b0>, transformer_layer_cls={<class 'transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer'>})])
(WorkerDict pid=97566) Total steps: 1136, num_warmup_steps: 10
(WorkerDict pid=97566) Actor use_remove_padding=True
(WorkerDict pid=97569) Monkey patch _flash_attention_forward in transformers.integrations.flash_attention [repeated 6x across cluster]
(WorkerDict pid=97565) wrap_policy: functools.partial(<function _or_policy at 0x7fd4309436d0>, policies=[functools.partial(<function transformer_auto_wrap_policy at 0x7fd4309435b0>, transformer_layer_cls={<class 'transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer'>})]) [repeated 7x across cluster]
(WorkerDict pid=97563) Actor use_remove_padding=True
Here's the traceback:
Traceback (most recent call last):
File "/gemini/code/verl/verl/trainer/main_ppo.py", line 64, in main
run_ppo(config)
File "/gemini/code/verl/verl/trainer/main_ppo.py", line 76, in run_ppo
ray.get(runner.run.remote(config))
File "/root/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/root/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/root/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/worker.py", line 2822, in get
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
File "/root/miniconda3/envs/verl/lib/python3.10/site-packages/ray/_private/worker.py", line 930, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ActorDiedError): ray::TaskRunner.run() (pid=92717, ip=10.244.61.194, actor_id=7e20de0a314d4e1a2dd2290301000000, repr=<main_ppo.TaskRunner object at 0x7f065bd45de0>)
File "/gemini/code/verl/verl/trainer/main_ppo.py", line 182, in run
trainer.init_workers()
File "/gemini/code/verl/verl/trainer/ppo/ray_trainer.py", line 738, in init_workers
self.actor_rollout_wg.init_model()
File "/gemini/code/verl/verl/single_controller/ray/base.py", line 49, in func
output = ray.get(output)
ray.exceptions.ActorDiedError: The actor died unexpectedly before finishing this task.
class_name: create_colocated_worker_cls.<locals>.WorkerDict
actor_id: 1d0b093d1f86691622baca3d01000000
pid: 97568
name: QYiQ3qWorkerDict_0:6
namespace: 0ed6ba2f-d9c4-4464-95ef-25db26fc3bd2
ip: 10.244.61.194
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
(raylet) A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff78deb03788b39734978864e501000000 Worker ID: 35f4e37bf5f070d5c35b3016ab5b331eb7d790c3863965846f4403f9 Node ID: b73256901e3b81e6e62974514bb76c31f0ef8b29e0d33f8ebb1ca011 Worker IP address: 10.244.61.194 Worker port: 36809 Worker PID: 97569 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors. [repeated 4x across cluster]
(WorkerDict pid=97563) [rank1]:[F520 14:15:45.165853183 ProcessGroupNCCL.cpp:1575] [PG ID 0 PG GUID 0(default_pg) Rank 1] [PG ID 0 PG GUID 0(default_pg) Rank 1] Terminating the process after attempting to dump debug info, due to ProcessGroupNCCL watchdog hang. [repeated 7x across cluster]
(WorkerDict pid=97563) *** SIGABRT received at time=1747721745 on cpu 54 *** [repeated 7x across cluster]
(WorkerDict pid=97563) PC: @ 0x7f89c688600b (unknown) raise [repeated 7x across cluster]
(WorkerDict pid=97563) @ 0x7f89c6a49420 1244648272 (unknown) [repeated 7x across cluster]
(WorkerDict pid=97563) @ ... and at least 1 more frames [repeated 7x across cluster]
(WorkerDict pid=97563) [2025-05-20 14:15:45,507 E 97563 98481] logging.cc:496: *** SIGABRT received at time=1747721745 on cpu 54 *** [repeated 7x across cluster]
(WorkerDict pid=97563) [2025-05-20 14:15:45,507 E 97563 98481] logging.cc:496: PC: @ 0x7f89c688600b (unknown) raise [repeated 7x across cluster]
(WorkerDict pid=97563) [2025-05-20 14:15:45,507 E 97563 98481] logging.cc:496: @ 0x7f89c6a49420 1244648272 (unknown) [repeated 7x across cluster]
(WorkerDict pid=97563) [2025-05-20 14:15:45,507 E 97563 98481] logging.cc:496: @ ... and at least 1 more frames [repeated 7x across cluster]
(WorkerDict pid=97563) Fatal Python error: Aborted [repeated 7x across cluster]
Here's the nvidia-smi output:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06 Driver Version: 525.125.06 CUDA Version: 12.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... On | 00000000:01:00.0 Off | Off |
| 43% 27C P2 66W / 450W | 21614MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... On | 00000000:24:00.0 Off | Off |
| 43% 27C P2 72W / 450W | 17562MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce ... On | 00000000:41:00.0 Off | Off |
| 42% 26C P2 59W / 450W | 17590MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce ... On | 00000000:61:00.0 Off | Off |
| 42% 27C P2 69W / 450W | 17564MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 4 NVIDIA GeForce ... On | 00000000:81:00.0 Off | Off |
| 42% 27C P2 63W / 450W | 17546MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 5 NVIDIA GeForce ... On | 00000000:A1:00.0 Off | Off |
| 44% 26C P2 63W / 450W | 17580MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 6 NVIDIA GeForce ... On | 00000000:C1:00.0 Off | Off |
| 43% 27C P2 55W / 450W | 17566MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 7 NVIDIA GeForce ... On | 00000000:E1:00.0 Off | Off |
| 40% 25C P2 63W / 450W | 17584MiB / 24564MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
+-----------------------------------------------------------------------------+
There is no error information, but GPU utilization stays at 0%. Can you help me figure out what's wrong?