Describe the bug
Transcribing a Japanese audio file with the Whisper executor (language='ja') fails during decoding: in the decoder's update step (whipser.py, line 784), paddle.concat receives tensors of different ranks and raises ValueError: (InvalidArgument) The shape of input[0] and input[1] is expected to be equal. Full traceback:
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_2835/486400536.py in <module>
10 audio_file=audio_file,
11 language='ja',
---> 12 device=paddle.get_device())
~/external-libraries/paddlespeech/cli/utils.py in _warpper(self, *args, **kwargs)
326 except Exception:
327 pass
--> 328 return executor_func(self, *args, **kwargs)
329
330 return _warpper
~/external-libraries/paddlespeech/cli/whisper/infer.py in __call__(self, audio_file, model, lang, task, size, language, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
482
483 self.preprocess(model, audio_file)
--> 484 self.infer(model)
485 res = self.postprocess() # Retrieve result of asr.
486
<decorator-gen-695> in infer(self, model_type)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/cli/whisper/infer.py in infer(self, model_type)
293 initial_prompt=cfg.initial_prompt,
294 condition_on_previous_text=cfg.condition_on_previous_text,
--> 295 no_speech_threshold=cfg.no_speech_threshold)
296
297 def postprocess(self) -> Union[str, os.PathLike]:
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in transcribe(model, mel, resource_path, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, **decode_options)
586
587 decode_options["prompt"] = all_tokens[prompt_reset_since:]
--> 588 result: DecodingResult = decode_with_fallback(segment)
589 tokens = paddle.to_tensor(result.tokens)
590
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in decode_with_fallback(segment)
518
519 options = DecodingOptions(**kwargs, temperature=t)
--> 520 decode_result = model.decode(segment, options, resource_path)
521
522 needs_fallback = False
<decorator-gen-692> in decode(model, mel, options, resource_path)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in decode(model, mel, options, resource_path)
1296 mel = mel.unsqueeze(0)
1297
--> 1298 result = DecodingTask(model, options, resource_path).run(mel)
1299
1300 if single:
<decorator-gen-689> in run(self, mel)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in run(self, mel)
1219 # call the main sampling loop
1220 tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features,
--> 1221 tokens)
1222
1223 # reshape the tensors to have (batch_size, beam_size) as the first two dimensions
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in _main_loop(self, audio_features, tokens)
1167 # expand the tokens tensor with the selected next tokens
1168 tokens, completed = self.decoder.update(tokens, logits,
--> 1169 sum_logprobs)
1170 if completed or tokens.shape[-1] > self.n_ctx:
1171 break
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in update(self, tokens, logits, sum_logprobs)
782
783 next_tokens[tokens[:, -1] == self.eot] = self.eot
--> 784 tokens = paddle.concat([tokens, next_tokens[:, None]], axis=-1)
785
786 completed = paddle.all((tokens[:, -1] == self.eot))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/tensor/manipulation.py in concat(x, axis, name)
1138 if not isinstance(input, Variable):
1139 input = [t for t in input if t.shape.count(0) == 0]
--> 1140 return _C_ops.concat(input, axis)
1141
1142 if _in_legacy_dygraph():
ValueError: (InvalidArgument) The shape of input[0] and input[1] is expected to be equal.But received input[0]'s shape = [5, 3], input[1]'s shape = [5, 1, 51865, 5].
[Hint: Expected inputs_dims[i].size() == out_dims.size(), but received inputs_dims[i].size():4 != out_dims.size():2.] (at /paddle/paddle/phi/kernels/funcs/concat_funcs.h:55)
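For reference, the following minimal sketch (my own illustration, not code from PaddleSpeech) reproduces the same class of error in isolation: paddle.concat rejects inputs whose ranks differ, which matches the shapes [5, 3] and [5, 1, 51865, 5] reported above.

import paddle

# Hypothetical stand-in tensors with the same ranks as input[0] and input[1]
# from the error message; these are not the actual tensors built in whipser.py.
tokens = paddle.zeros([5, 3], dtype='int64')                      # rank 2, like input[0]
bad_next_tokens = paddle.zeros([5, 1, 51865, 5], dtype='int64')   # rank 4, like input[1]

# Raises ValueError: (InvalidArgument) The shape of input[0] and input[1] is
# expected to be equal, because a rank-2 and a rank-4 tensor cannot be concatenated.
result = paddle.concat([tokens, bad_next_tokens], axis=-1)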
To Reproduce
Steps to reproduce the behavior:
import paddle
from paddlespeech.cli.whisper import WhisperExecutor

audio_file = 'audio.wav'
whisper_executor = WhisperExecutor()
result = whisper_executor(
    model='whisper',
    task='transcribe',
    size='medium',
    sample_rate=16000,
    config=None,  # Set `config` and `ckpt_path` to None to use the pretrained model.
    ckpt_path=None,
    audio_file=audio_file,
    language='ja',
    device=paddle.get_device())
Environment (please complete the following information):
- Hardware: Baidu AI Studio V100 32G
- OS: Ubuntu
- GCC/G++ Version: unknown
- Python Version: 3.7
- PaddlePaddle Version: 2.4.0
- Model Version: whisper-large
- GPU/Driver Information: Tesla V100-SXM2-32GB / driver 460.32.03
- CUDA/cuDNN Version: CUDA 10.2 / cuDNN 8.2