# tencent_speech_service.py (7.7 KB)
  1. import base64
  2. import json
  3. import time
  4. from typing import Any
  5. from utils.config import settings
  6. from utils.logger import logger
  7. CONTENT_TYPE_BY_CODEC = {
  8. "mp3": "audio/mpeg",
  9. "wav": "audio/wav",
  10. "pcm": "audio/pcm",
  11. }
  12. class TencentSpeechService:
  13. def __init__(self):
  14. self.config = settings.speech
  15. def _ensure_available(self):
  16. if not self.config.enabled:
  17. raise ValueError("语音服务未启用")
  18. if self.config.provider != "tencent_cloud":
  19. raise ValueError(f"暂不支持的语音服务商: {self.config.provider}")
  20. credentials = self.config.tencent
  21. if not credentials.app_id or not credentials.secret_id or not credentials.secret_key:
  22. raise ValueError("腾讯云语音服务凭证未配置")
  23. def _load_tencent_sdk(self):
  24. try:
  25. from tencentcloud.common import credential
  26. from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
  27. TencentCloudSDKException,
  28. )
  29. from tencentcloud.asr.v20190614 import asr_client, models as asr_models
  30. from tencentcloud.tts.v20190823 import tts_client, models as tts_models
  31. except ImportError as exc:
  32. raise RuntimeError(
  33. "未安装腾讯云 SDK,请先安装 tencentcloud-sdk-python"
  34. ) from exc
  35. return {
  36. "credential": credential,
  37. "TencentCloudSDKException": TencentCloudSDKException,
  38. "asr_client": asr_client,
  39. "asr_models": asr_models,
  40. "tts_client": tts_client,
  41. "tts_models": tts_models,
  42. }
  43. def _build_credentials(self, credential_module):
  44. cfg = self.config.tencent
  45. return credential_module.Credential(cfg.secret_id, cfg.secret_key)
  46. @staticmethod
  47. def _read_upload_bytes(upload_file) -> bytes:
  48. source = getattr(upload_file, "file", None)
  49. if source is None:
  50. raise ValueError("未读取到音频文件")
  51. if hasattr(source, "seek"):
  52. source.seek(0)
  53. data = source.read()
  54. if hasattr(source, "seek"):
  55. source.seek(0)
  56. if not data:
  57. raise ValueError("音频文件为空")
  58. return data
  59. def transcribe_file(self, upload_file, user_id: str = "", request_id: str | None = None) -> dict[str, Any]:
  60. self._ensure_available()
  61. if not getattr(self.config.transcribe, "enabled", True):
  62. raise ValueError("语音转文字未启用")
  63. audio_bytes = self._read_upload_bytes(upload_file)
  64. max_size_bytes = self.config.transcribe.max_audio_size_mb * 1024 * 1024
  65. if len(audio_bytes) > max_size_bytes:
  66. raise ValueError(
  67. f"音频文件超过 {self.config.transcribe.max_audio_size_mb}MB,当前版本请缩短录音后重试"
  68. )
  69. sdk = self._load_tencent_sdk()
  70. credentials = self._build_credentials(sdk["credential"])
  71. client = sdk["asr_client"].AsrClient(credentials, self.config.tencent.region)
  72. models = sdk["asr_models"]
  73. usr_audio_key = request_id or f"{user_id or 'user'}-{int(time.time() * 1000)}"
  74. params = {
  75. "ProjectId": 0,
  76. "SubServiceType": 2,
  77. "EngSerViceType": self.config.transcribe.engine_model_type,
  78. "SourceType": self.config.transcribe.source_type,
  79. "VoiceFormat": self.config.transcribe.voice_format,
  80. "UsrAudioKey": usr_audio_key,
  81. "Data": base64.b64encode(audio_bytes).decode("utf-8"),
  82. "DataLen": len(audio_bytes),
  83. "WordInfo": self.config.transcribe.word_info,
  84. "FilterDirty": self.config.transcribe.filter_dirty,
  85. "FilterModal": self.config.transcribe.filter_modal,
  86. "FilterPunc": self.config.transcribe.filter_punc,
  87. "ConvertNumMode": self.config.transcribe.convert_num_mode,
  88. }
  89. if self.config.transcribe.hotword_id:
  90. params["HotwordId"] = self.config.transcribe.hotword_id
  91. elif self.config.transcribe.hotword_list:
  92. params["HotwordList"] = self.config.transcribe.hotword_list
  93. if self.config.transcribe.customization_id:
  94. params["CustomizationId"] = self.config.transcribe.customization_id
  95. if self.config.transcribe.replace_text_id:
  96. params["ReplaceTextId"] = self.config.transcribe.replace_text_id
  97. req = models.SentenceRecognitionRequest()
  98. req.from_json_string(json.dumps(params))
  99. try:
  100. resp = client.SentenceRecognition(req)
  101. text = getattr(resp, "Result", "") or ""
  102. request_id = getattr(resp, "RequestId", "")
  103. if not text.strip():
  104. raise RuntimeError("腾讯云语音识别未返回文本内容")
  105. return {
  106. "text": text.strip(),
  107. "request_id": request_id,
  108. "raw": resp.to_json_string(),
  109. }
  110. except sdk["TencentCloudSDKException"] as exc:
  111. logger.error("[speech] Tencent ASR failed: %s", exc)
  112. raise RuntimeError(f"腾讯云语音识别失败: {exc}") from exc
  113. def synthesize_text(
  114. self,
  115. text: str,
  116. *,
  117. session_id: str | None = None,
  118. voice_type: int | None = None,
  119. speed: float | None = None,
  120. volume: float | None = None,
  121. ) -> dict[str, Any]:
  122. self._ensure_available()
  123. if not getattr(self.config.synthesize, "enabled", True):
  124. raise ValueError("语音播报未启用")
  125. normalized_text = (text or "").strip()
  126. if not normalized_text:
  127. raise ValueError("播报文本不能为空")
  128. if len(normalized_text) > self.config.synthesize.basic_text_limit_chars:
  129. raise ValueError(
  130. f"播报文本超过 {self.config.synthesize.basic_text_limit_chars} 字,请先分段后重试"
  131. )
  132. sdk = self._load_tencent_sdk()
  133. credentials = self._build_credentials(sdk["credential"])
  134. client = sdk["tts_client"].TtsClient(credentials, self.config.tencent.region)
  135. models = sdk["tts_models"]
  136. codec = self.config.synthesize.codec.lower()
  137. params = {
  138. "Text": normalized_text,
  139. "SessionId": session_id or f"speech-{int(time.time() * 1000)}",
  140. "ProjectId": 0,
  141. "ModelType": 1,
  142. "Volume": volume if volume is not None else self.config.synthesize.volume,
  143. "Speed": speed if speed is not None else self.config.synthesize.speed,
  144. "VoiceType": voice_type if voice_type is not None else self.config.synthesize.voice_type,
  145. "SampleRate": self.config.synthesize.sample_rate,
  146. "Codec": codec,
  147. "PrimaryLanguage": self.config.synthesize.primary_language,
  148. "EnableSubtitle": self.config.synthesize.enable_subtitle,
  149. }
  150. req = models.TextToVoiceRequest()
  151. req.from_json_string(json.dumps(params))
  152. try:
  153. resp = client.TextToVoice(req)
  154. audio_base64 = getattr(resp, "Audio", "") or ""
  155. request_id = getattr(resp, "RequestId", "")
  156. if not audio_base64:
  157. raise RuntimeError("腾讯云语音合成未返回音频内容")
  158. return {
  159. "audio_bytes": base64.b64decode(audio_base64),
  160. "content_type": CONTENT_TYPE_BY_CODEC.get(codec, "application/octet-stream"),
  161. "request_id": request_id,
  162. "raw": resp.to_json_string(),
  163. }
  164. except sdk["TencentCloudSDKException"] as exc:
  165. logger.error("[speech] Tencent TTS failed: %s", exc)
  166. raise RuntimeError(f"腾讯云语音合成失败: {exc}") from exc
  167. tencent_speech_service = TencentSpeechService()