stt.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. import io
  2. from typing import Dict
  3. from openai import AzureOpenAI
  4. from common.config.tokenizer_manage_config import TokenizerManage
  5. from models_provider.base_model_provider import MaxKBBaseModel
  6. from models_provider.impl.base_stt import BaseSpeechToText
  7. def custom_get_token_ids(text: str):
  8. tokenizer = TokenizerManage.get_tokenizer()
  9. return tokenizer.encode(text)
  10. class AzureOpenAISpeechToText(MaxKBBaseModel, BaseSpeechToText):
  11. api_base: str
  12. api_key: str
  13. api_version: str
  14. model: str
  15. params: dict
  16. def __init__(self, **kwargs):
  17. super().__init__(**kwargs)
  18. self.api_key = kwargs.get('api_key')
  19. self.api_base = kwargs.get('api_base')
  20. self.api_version = kwargs.get('api_version')
  21. self.params = kwargs.get('params')
  22. @staticmethod
  23. def is_cache_model():
  24. return False
  25. @staticmethod
  26. def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
  27. optional_params = {}
  28. if 'max_tokens' in model_kwargs and model_kwargs['max_tokens'] is not None:
  29. optional_params['max_tokens'] = model_kwargs['max_tokens']
  30. if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
  31. optional_params['temperature'] = model_kwargs['temperature']
  32. return AzureOpenAISpeechToText(
  33. model=model_name,
  34. api_base=model_credential.get('api_base'),
  35. api_key=model_credential.get('api_key'),
  36. api_version=model_credential.get('api_version'),
  37. params=model_kwargs,
  38. **optional_params,
  39. )
  40. def check_auth(self):
  41. client = AzureOpenAI(
  42. azure_endpoint=self.api_base,
  43. api_key=self.api_key,
  44. api_version=self.api_version
  45. )
  46. response_list = client.models.with_raw_response.list()
  47. # print(response_list)
  48. def speech_to_text(self, audio_file):
  49. client = AzureOpenAI(
  50. azure_endpoint=self.api_base,
  51. api_key=self.api_key,
  52. api_version=self.api_version
  53. )
  54. audio_data = audio_file.read()
  55. buffer = io.BytesIO(audio_data)
  56. buffer.name = "file.mp3" # this is the important line
  57. filter_params = {k: v for k, v in self.params.items() if k not in {'model_id', 'use_local', 'streaming'}}
  58. transcription_params = {
  59. 'model': self.model,
  60. 'file': buffer,
  61. 'language': 'zh'
  62. }
  63. res = client.audio.transcriptions.create(**transcription_params, extra_body=filter_params)
  64. return res.text