hunzhizi/CARD
  • Install dependencies
pip3 install transformers==4.45.2 tqdm ipdb accelerate numpy shortuuid fschat fastchat
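A quick, optional sanity check before launching any of the torchrun commands below (this snippet is illustrative and not part of the repository; it assumes PyTorch is installed alongside the packages above):

```python
# Optional environment sanity check (illustrative only, not part of the repo).
import torch
import transformers
import accelerate

print("transformers:", transformers.__version__)      # expected: 4.45.2
print("accelerate:  ", accelerate.__version__)
print("torch:       ", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("visible GPUs: ", torch.cuda.device_count())
```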
  • Set the Python path (adjust /home/TreeDecoding/ to the location of your checkout)
export PYTHONPATH=$PYTHONPATH:/home/TreeDecoding/
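To confirm the export was picked up (the path is the one from the command above; substitute your own location):

```python
# Quick check that the repository root is visible to Python after the export.
import sys
print([p for p in sys.path if "TreeDecoding" in p])   # should list /home/TreeDecoding/
```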
  • Test the environment: parallel (two-model) decoding
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 ParallelDecodingModel.py --eval_mode two_model --draft_model Llama-3.2-1B-Instruct  --target_model Llama-3.1-8B-Instruct --max_tokens 512 
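For intuition about what two_model mode does at a high level, below is a minimal, generic draft-and-verify sketch using plain HF transformers on a single GPU. It only illustrates two-model decoding in general; the model names mirror the command above (local checkpoint paths), and CARD's actual algorithm in ParallelDecodingModel.py may differ substantially.

```python
# Generic draft-and-verify (speculative) decoding sketch -- NOT CARD's algorithm,
# just an illustration of two-model decoding. Single GPU, greedy, fp16.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
draft_name, target_name = "Llama-3.2-1B-Instruct", "Llama-3.1-8B-Instruct"  # local paths, as above
tok = AutoTokenizer.from_pretrained(target_name)
draft = AutoModelForCausalLM.from_pretrained(draft_name, torch_dtype=torch.float16).to(device)
target = AutoModelForCausalLM.from_pretrained(target_name, torch_dtype=torch.float16).to(device)

@torch.no_grad()
def two_model_generate(prompt, max_tokens=128, k=4):
    ids = tok(prompt, return_tensors="pt").input_ids.to(device)
    while ids.shape[1] < max_tokens:
        # 1) the small draft model proposes up to k tokens greedily
        proposal = draft.generate(ids, max_new_tokens=k, do_sample=False)
        # 2) the large target model scores the whole proposal in one forward pass
        target_next = target(proposal).logits.argmax(dim=-1)  # greedy choice at each position
        # 3) accept the longest prefix of the proposal the target agrees with
        accepted = ids.shape[1]
        for pos in range(ids.shape[1], proposal.shape[1]):
            if proposal[0, pos].item() == target_next[0, pos - 1].item():
                accepted = pos + 1
            else:
                break
        # 4) append the target's own token at the first disagreement (guarantees progress)
        ids = torch.cat([proposal[:, :accepted], target_next[:, accepted - 1:accepted]], dim=1)
    return tok.decode(ids[0], skip_special_tokens=True)
```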
  • Test the environment: single-model decoding
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=1 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 ParallelDecodingModel.py --eval_mode single_model  --model_name Llama-3.1-8B-Instruct --max_tokens 512 
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=1 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 ParallelDecodingModel.py --eval_mode single_model  --model_name Llama-3.1-70B-Instruct --max_tokens 512 
  • Profile the 70B model (parallel decoding and benchmark evaluations)
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 ParallelDecodingModel.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50 --communication_ratio 6
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_gsm8k.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_humaneval.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_mgsm.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_mt_bench.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_mbpp.py --eval_mode two_model --draft_model Llama-3.1-8B-Instruct  --target_model Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 50
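When profiling runs like these, the core figure is tokens generated per wall-clock second. The eval_*.py scripts presumably report their own metrics; the helper below only sketches the general measurement pattern (the function name is illustrative):

```python
# Sketch of wall-clock throughput measurement around a generation call
# (illustrative helper, not part of the repository's eval scripts).
import time
import torch

def timed_generate(model, input_ids, **gen_kwargs):
    torch.cuda.synchronize()                 # flush pending GPU work before timing
    start = time.perf_counter()
    out = model.generate(input_ids, **gen_kwargs)
    torch.cuda.synchronize()                 # wait for generation to actually finish
    elapsed = time.perf_counter() - start
    new_tokens = out.shape[1] - input_ids.shape[1]
    print(f"{new_tokens} new tokens in {elapsed:.2f}s -> {new_tokens / elapsed:.1f} tok/s")
    return out
```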
  • Test autoregressive decoding
CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=1 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 ParallelDecodingModel.py --eval_mode single_model  --model_name Llama-3.1-70B-Instruct --max_tokens 512 --nodes_per_layer 100
  • Benchmark evaluation (run from the benchmark directory: cd benchmark)
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node=2 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_gsm8k.py --eval_mode two_model --draft_model Llama-3.2-1B-Instruct  --target_model Llama-3.1-8B-Instruct --max_tokens 512 
  • Profile autoregressive execution time
CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 --master_addr=127.0.0.1 --master_port=12345 eval_gsm8k.py --eval_mode single_model  --model_name Llama-3.1-8B-Instruct --max_tokens 512
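single_model mode is the plain autoregressive baseline; for reference, the rough equivalent in stock HF transformers looks like this (model name mirrors the command above and is assumed to be a local checkpoint, and the prompt is just an example):

```python
# Plain autoregressive baseline (what single_model mode corresponds to conceptually);
# stock HF generation, shown only for comparison with the two-model commands above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "Llama-3.1-8B-Instruct"   # local checkpoint path, as in the command above
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16, device_map="auto")

inputs = tok("Natalia sold clips to 48 of her friends...", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=512, do_sample=False)
print(tok.decode(out[0], skip_special_tokens=True))
```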
