tts.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from typing import Dict
  2. from openai import AzureOpenAI
  3. from common.config.tokenizer_manage_config import TokenizerManage
  4. from common.utils.common import _remove_empty_lines
  5. from models_provider.base_model_provider import MaxKBBaseModel
  6. from models_provider.impl.base_tts import BaseTextToSpeech
  7. def custom_get_token_ids(text: str):
  8. tokenizer = TokenizerManage.get_tokenizer()
  9. return tokenizer.encode(text)
  10. class AzureOpenAITextToSpeech(MaxKBBaseModel, BaseTextToSpeech):
  11. api_base: str
  12. api_key: str
  13. api_version: str
  14. model: str
  15. params: dict
  16. def __init__(self, **kwargs):
  17. super().__init__(**kwargs)
  18. self.api_key = kwargs.get('api_key')
  19. self.api_base = kwargs.get('api_base')
  20. self.api_version = kwargs.get('api_version')
  21. self.model = kwargs.get('model')
  22. self.params = kwargs.get('params')
  23. @staticmethod
  24. def is_cache_model():
  25. return False
  26. @staticmethod
  27. def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
  28. optional_params = {'params': {'voice': 'alloy'}}
  29. for key, value in model_kwargs.items():
  30. if key not in ['model_id', 'use_local', 'streaming']:
  31. optional_params['params'][key] = value
  32. return AzureOpenAITextToSpeech(
  33. model=model_name,
  34. api_base=model_credential.get('api_base'),
  35. api_key=model_credential.get('api_key'),
  36. api_version=model_credential.get('api_version'),
  37. **optional_params,
  38. )
  39. def check_auth(self):
  40. client = AzureOpenAI(
  41. azure_endpoint=self.api_base,
  42. api_key=self.api_key,
  43. api_version=self.api_version
  44. )
  45. response_list = client.models.with_raw_response.list()
  46. # print(response_list)
  47. def text_to_speech(self, text):
  48. client = AzureOpenAI(
  49. azure_endpoint=self.api_base,
  50. api_key=self.api_key,
  51. api_version=self.api_version
  52. )
  53. text = _remove_empty_lines(text)
  54. with client.audio.speech.with_streaming_response.create(
  55. model=self.model,
  56. input=text,
  57. **self.params
  58. ) as response:
  59. return response.read()