Describe the bug
Transcribing a Japanese audio file with the Whisper executor (language='ja') fails during decoding: in the decoder's update step (whipser.py, line 784), paddle.concat receives tensors of different ranks and raises ValueError: (InvalidArgument) The shape of input[0] and input[1] is expected to be equal. Full traceback:
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_2835/486400536.py in <module>
10 audio_file=audio_file,
11 language='ja',
---> 12 device=paddle.get_device())
~/external-libraries/paddlespeech/cli/utils.py in _warpper(self, *args, **kwargs)
326 except Exception:
327 pass
--> 328 return executor_func(self, *args, **kwargs)
329
330 return _warpper
~/external-libraries/paddlespeech/cli/whisper/infer.py in __call__(self, audio_file, model, lang, task, size, language, sample_rate, config, ckpt_path, decode_method, num_decoding_left_chunks, force_yes, rtf, device)
482
483 self.preprocess(model, audio_file)
--> 484 self.infer(model)
485 res = self.postprocess() # Retrieve result of asr.
486
<decorator-gen-695> in infer(self, model_type)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/cli/whisper/infer.py in infer(self, model_type)
293 initial_prompt=cfg.initial_prompt,
294 condition_on_previous_text=cfg.condition_on_previous_text,
--> 295 no_speech_threshold=cfg.no_speech_threshold)
296
297 def postprocess(self) -> Union[str, os.PathLike]:
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in transcribe(model, mel, resource_path, verbose, temperature, compression_ratio_threshold, logprob_threshold, no_speech_threshold, condition_on_previous_text, **decode_options)
586
587 decode_options["prompt"] = all_tokens[prompt_reset_since:]
--> 588 result: DecodingResult = decode_with_fallback(segment)
589 tokens = paddle.to_tensor(result.tokens)
590
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in decode_with_fallback(segment)
518
519 options = DecodingOptions(**kwargs, temperature=t)
--> 520 decode_result = model.decode(segment, options, resource_path)
521
522 needs_fallback = False
<decorator-gen-692> in decode(model, mel, options, resource_path)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in decode(model, mel, options, resource_path)
1296 mel = mel.unsqueeze(0)
1297
--> 1298 result = DecodingTask(model, options, resource_path).run(mel)
1299
1300 if single:
<decorator-gen-689> in run(self, mel)
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/base.py in _decorate_function(func, *args, **kwargs)
373 def _decorate_function(func, *args, **kwargs):
374 with self:
--> 375 return func(*args, **kwargs)
376
377 @decorator.decorator
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in run(self, mel)
1219 # call the main sampling loop
1220 tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features,
--> 1221 tokens)
1222
1223 # reshape the tensors to have (batch_size, beam_size) as the first two dimensions
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in _main_loop(self, audio_features, tokens)
1167 # expand the tokens tensor with the selected next tokens
1168 tokens, completed = self.decoder.update(tokens, logits,
--> 1169 sum_logprobs)
1170 if completed or tokens.shape[-1] > self.n_ctx:
1171 break
~/external-libraries/paddlespeech/s2t/models/whisper/whipser.py in update(self, tokens, logits, sum_logprobs)
782
783 next_tokens[tokens[:, -1] == self.eot] = self.eot
--> 784 tokens = paddle.concat([tokens, next_tokens[:, None]], axis=-1)
785
786 completed = paddle.all((tokens[:, -1] == self.eot))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/tensor/manipulation.py in concat(x, axis, name)
1138 if not isinstance(input, Variable):
1139 input = [t for t in input if t.shape.count(0) == 0]
--> 1140 return _C_ops.concat(input, axis)
1141
1142 if _in_legacy_dygraph():
ValueError: (InvalidArgument) The shape of input[0] and input[1] is expected to be equal.But received input[0]'s shape = [5, 3], input[1]'s shape = [5, 1, 51865, 5].
[Hint: Expected inputs_dims[i].size() == out_dims.size(), but received inputs_dims[i].size():4 != out_dims.size():2.] (at /paddle/paddle/phi/kernels/funcs/concat_funcs.h:55)
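For reference, the following minimal sketch (my own illustration, not code from PaddleSpeech) reproduces the same class of error in isolation: paddle.concat rejects inputs whose ranks differ, which matches the shapes [5, 3] and [5, 1, 51865, 5] reported above.

import paddle

# Hypothetical stand-in tensors with the same ranks as input[0] and input[1]
# from the error message; these are not the actual tensors built in whipser.py.
tokens = paddle.zeros([5, 3], dtype='int64')                      # rank 2, like input[0]
bad_next_tokens = paddle.zeros([5, 1, 51865, 5], dtype='int64')   # rank 4, like input[1]

# Raises ValueError: (InvalidArgument) The shape of input[0] and input[1] is
# expected to be equal, because a rank-2 and a rank-4 tensor cannot be concatenated.
result = paddle.concat([tokens, bad_next_tokens], axis=-1)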
To Reproduce
Steps to reproduce the behavior:
import paddle
from paddlespeech.cli.whisper import WhisperExecutor

audio_file = 'audio.wav'
whisper_executor = WhisperExecutor()
result = whisper_executor(
    model='whisper',
    task='transcribe',
    size='medium',
    sample_rate=16000,
    config=None,  # Set `config` and `ckpt_path` to None to use the pretrained model.
    ckpt_path=None,
    audio_file=audio_file,
    language='ja',
    device=paddle.get_device())
Environment (please complete the following information):
- Hardware: Baidu AI Studio V100 32G
- OS: Ubuntu
- GCC/G++ Version: unknown
- Python Version: 3.7
- PaddlePaddle Version: 2.4.0
- Model Version: whisper-large
- GPU/Driver Information: Tesla V100-SXM2-32GB / driver 460.32.03
- CUDA/cuDNN Version: CUDA 10.2 / cuDNN 8.2