-
Notifications
You must be signed in to change notification settings - Fork 289
Closed
Description
I am running the run_clm.py
example from Hugging Face Transformers, which I invoke with accelerate launch
, and which uses the accuracy
metric. I modified the load
call as follows to prevent cache clashes between processes:
metric = evaluate.load(
"accuracy",
experiment_id=f"{training_args.run_name}_{int(os.environ['RANK'])}_of_{int(os.environ['LOCAL_WORLD_SIZE'])}",
process_id=int(os.environ["RANK"]),
num_process=int(os.environ["LOCAL_WORLD_SIZE"])
)
When the evaluation loop completes after eval_steps
, my training job fails with the following error:
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 468, in <module>
main()
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 435, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 2268, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3019, in evaluate
output = eval_loop(
^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3310, in evaluation_loop
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 411, in compute_metrics
return metric.compute(predictions=preds, references=labels)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 450, in compute
self.add_batch(**inputs)
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 510, in add_batch
self._init_writer()
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 656, in _init_writer
self._check_all_processes_locks() # wait for everyone to be ready
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 350, in _check_all_processes_locks
raise ValueError(
ValueError: Expected to find locked file /storage/ukp/work/paul/.cache/huggingface/metrics/accuracy/default/ds1_ir_psc_hlrd_0_of_4-4-0.arrow.lock from process 0 but it doesn't exist.
Traceback (most recent call last):
Traceback (most recent call last):
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 468, in <module>
Traceback (most recent call last):
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 468, in <module>
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 468, in <module>
main()
main()
main()
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 435, in main
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 435, in main
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 435, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
^^ ^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^
^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1537, in train
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1537, in train
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1537, in train
return inner_training_loop(
return inner_training_loop(
return inner_training_loop(
^^ ^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 1914, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 2268, in _maybe_log_save_evaluate
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 2268, in _maybe_log_save_evaluate
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 2268, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
^ ^^^ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3019, in evaluate
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3019, in evaluate
^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3019, in evaluate
output = eval_loop(
output = eval_loop(
output = eval_loop(
^ ^^^^ ^^^^ ^^^^^^^^^^^^^^^^
^^
^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3310, in evaluation_loop
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3310, in evaluation_loop
^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/transformers/trainer.py", line 3310, in evaluation_loop
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
^ ^ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^ File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 411, in compute_metrics
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 411, in compute_metrics
^^
return metric.compute(predictions=preds, references=labels)
return metric.compute(predictions=preds, references=labels)
File "/ukp-storage-1/paul/Projects/LLVM-Align/Experiments/Training/run_ir_clm_trainer.py", line 411, in compute_metrics
return metric.compute(predictions=preds, references=labels)
^^ ^^^^ ^^^^ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 450, in compute
^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 450, in compute
^^
self.add_batch(**inputs)self.add_batch(**inputs)
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 450, in compute
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 510, in add_batch
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 510, in add_batch
self.add_batch(**inputs)
self._init_writer()
self._init_writer() File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 510, in add_batch
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 659, in _init_writer
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 659, in _init_writer
self._init_writer()
self._check_rendez_vous() # wait for master to be ready and to let everyone go
self._check_rendez_vous() # wait for master to be ready and to let everyone go File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 659, in _init_writer
^ self._check_rendez_vous() # wait for master to be ready and to let everyone go
^^ ^ ^^^^ ^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^ File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 362, in _check_rendez_vous
^^^
^^
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 362, in _check_rendez_vous
raise ValueError(
File "/mnt/beegfs/work/paul/Miniforge/envs/code-gen/lib/python3.11/site-packages/evaluate/module.py", line 362, in _check_rendez_vous
ValueError : Expected to find locked file /storage/ukp/work/paul/.cache/huggingface/metrics/accuracy/default/ds1_ir_psc_hlrd_3_of_4-4-0.arrow.lock from process 3 but it doesn't exist.raise ValueError(
raise ValueError(
ValueError: ValueErrorExpected to find locked file /storage/ukp/work/paul/.cache/huggingface/metrics/accuracy/default/ds1_ir_psc_hlrd_1_of_4-4-0.arrow.lock from process 1 but it doesn't exist.: Expected to find locked file /storage/ukp/work/paul/.cache/huggingface/metrics/accuracy/default/ds1_ir_psc_hlrd_2_of_4-4-0.arrow.lock from process 2 but it doesn't exist.
I have checked the folder, and the lock files it says don't exist are in fact present. I am running the script on 4 GPUs on a single node under SLURM. Please advise on how to proceed or debug this.
Metadata
Metadata
Assignees
Labels
No labels