stt.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. from typing import Dict
  2. from django.utils.translation import gettext as _
  3. from langchain_core.messages import HumanMessage
  4. from langchain_google_genai import ChatGoogleGenerativeAI
  5. from common.config.tokenizer_manage_config import TokenizerManage
  6. from models_provider.base_model_provider import MaxKBBaseModel
  7. from models_provider.impl.base_stt import BaseSpeechToText
  8. def custom_get_token_ids(text: str):
  9. tokenizer = TokenizerManage.get_tokenizer()
  10. return tokenizer.encode(text)
  11. class GeminiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
  12. api_key: str
  13. model: str
  14. def __init__(self, **kwargs):
  15. super().__init__(**kwargs)
  16. self.api_key = kwargs.get('api_key')
  17. @staticmethod
  18. def is_cache_model():
  19. return False
  20. @staticmethod
  21. def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
  22. optional_params = {}
  23. if 'max_tokens' in model_kwargs and model_kwargs['max_tokens'] is not None:
  24. optional_params['max_tokens'] = model_kwargs['max_tokens']
  25. if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
  26. optional_params['temperature'] = model_kwargs['temperature']
  27. return GeminiSpeechToText(
  28. model=model_name,
  29. api_key=model_credential.get('api_key'),
  30. **optional_params,
  31. )
  32. def check_auth(self):
  33. client = ChatGoogleGenerativeAI(
  34. model=self.model,
  35. google_api_key=self.api_key
  36. )
  37. response_list = client.invoke(_('Hello'))
  38. # print(response_list)
  39. def speech_to_text(self, audio_file):
  40. client = ChatGoogleGenerativeAI(
  41. model=self.model,
  42. google_api_key=self.api_key
  43. )
  44. audio_data = audio_file.read()
  45. system_instruction = """You are a professional speech-to-text assistant. Your task is to:
  46. 1. Transcribe the audio content accurately into text
  47. 2. Output ONLY the transcribed text without any additional comments..."""
  48. msg = HumanMessage(content=[
  49. {'type': 'text', 'text': system_instruction},
  50. {"type": "media", 'mime_type': 'audio/mp3', "data": audio_data}
  51. ])
  52. res = client.invoke([msg])
  53. if isinstance(res.content, list):
  54. for item in res.content:
  55. if isinstance(item, dict) and 'text' in item:
  56. return item['text'].strip()
  57. elif hasattr(item, 'text'):
  58. return item.text.strip()
  59. return ''
  60. elif isinstance(res.content, dict):
  61. return res.content.get('text', '').strip()
  62. else:
  63. return str(res.content).strip() if res.content else ''