| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204 |
- import base64
- import json
- import time
- from typing import Any
- from utils.config import settings
- from utils.logger import logger
# MIME type reported for synthesized audio, keyed by lower-case codec name.
# Codecs missing from this table are served as "application/octet-stream"
# by the lookup in TencentSpeechService.synthesize_text.
CONTENT_TYPE_BY_CODEC: dict[str, str] = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "pcm": "audio/pcm",
}
- class TencentSpeechService:
- def __init__(self):
- self.config = settings.speech
- def _ensure_available(self):
- if not self.config.enabled:
- raise ValueError("语音服务未启用")
- if self.config.provider != "tencent_cloud":
- raise ValueError(f"暂不支持的语音服务商: {self.config.provider}")
- credentials = self.config.tencent
- if not credentials.app_id or not credentials.secret_id or not credentials.secret_key:
- raise ValueError("腾讯云语音服务凭证未配置")
- def _load_tencent_sdk(self):
- try:
- from tencentcloud.common import credential
- from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
- TencentCloudSDKException,
- )
- from tencentcloud.asr.v20190614 import asr_client, models as asr_models
- from tencentcloud.tts.v20190823 import tts_client, models as tts_models
- except ImportError as exc:
- raise RuntimeError(
- "未安装腾讯云 SDK,请先安装 tencentcloud-sdk-python"
- ) from exc
- return {
- "credential": credential,
- "TencentCloudSDKException": TencentCloudSDKException,
- "asr_client": asr_client,
- "asr_models": asr_models,
- "tts_client": tts_client,
- "tts_models": tts_models,
- }
- def _build_credentials(self, credential_module):
- cfg = self.config.tencent
- return credential_module.Credential(cfg.secret_id, cfg.secret_key)
- @staticmethod
- def _read_upload_bytes(upload_file) -> bytes:
- source = getattr(upload_file, "file", None)
- if source is None:
- raise ValueError("未读取到音频文件")
- if hasattr(source, "seek"):
- source.seek(0)
- data = source.read()
- if hasattr(source, "seek"):
- source.seek(0)
- if not data:
- raise ValueError("音频文件为空")
- return data
- def transcribe_file(self, upload_file, user_id: str = "", request_id: str | None = None) -> dict[str, Any]:
- self._ensure_available()
- if not getattr(self.config.transcribe, "enabled", True):
- raise ValueError("语音转文字未启用")
- audio_bytes = self._read_upload_bytes(upload_file)
- max_size_bytes = self.config.transcribe.max_audio_size_mb * 1024 * 1024
- if len(audio_bytes) > max_size_bytes:
- raise ValueError(
- f"音频文件超过 {self.config.transcribe.max_audio_size_mb}MB,当前版本请缩短录音后重试"
- )
- sdk = self._load_tencent_sdk()
- credentials = self._build_credentials(sdk["credential"])
- client = sdk["asr_client"].AsrClient(credentials, self.config.tencent.region)
- models = sdk["asr_models"]
- usr_audio_key = request_id or f"{user_id or 'user'}-{int(time.time() * 1000)}"
- params = {
- "ProjectId": 0,
- "SubServiceType": 2,
- "EngSerViceType": self.config.transcribe.engine_model_type,
- "SourceType": self.config.transcribe.source_type,
- "VoiceFormat": self.config.transcribe.voice_format,
- "UsrAudioKey": usr_audio_key,
- "Data": base64.b64encode(audio_bytes).decode("utf-8"),
- "DataLen": len(audio_bytes),
- "WordInfo": self.config.transcribe.word_info,
- "FilterDirty": self.config.transcribe.filter_dirty,
- "FilterModal": self.config.transcribe.filter_modal,
- "FilterPunc": self.config.transcribe.filter_punc,
- "ConvertNumMode": self.config.transcribe.convert_num_mode,
- }
- if self.config.transcribe.hotword_id:
- params["HotwordId"] = self.config.transcribe.hotword_id
- elif self.config.transcribe.hotword_list:
- params["HotwordList"] = self.config.transcribe.hotword_list
- if self.config.transcribe.customization_id:
- params["CustomizationId"] = self.config.transcribe.customization_id
- if self.config.transcribe.replace_text_id:
- params["ReplaceTextId"] = self.config.transcribe.replace_text_id
- req = models.SentenceRecognitionRequest()
- req.from_json_string(json.dumps(params))
- try:
- resp = client.SentenceRecognition(req)
- text = getattr(resp, "Result", "") or ""
- request_id = getattr(resp, "RequestId", "")
- if not text.strip():
- raise RuntimeError("腾讯云语音识别未返回文本内容")
- return {
- "text": text.strip(),
- "request_id": request_id,
- "raw": resp.to_json_string(),
- }
- except sdk["TencentCloudSDKException"] as exc:
- logger.error("[speech] Tencent ASR failed: %s", exc)
- raise RuntimeError(f"腾讯云语音识别失败: {exc}") from exc
- def synthesize_text(
- self,
- text: str,
- *,
- session_id: str | None = None,
- voice_type: int | None = None,
- speed: float | None = None,
- volume: float | None = None,
- ) -> dict[str, Any]:
- self._ensure_available()
- if not getattr(self.config.synthesize, "enabled", True):
- raise ValueError("语音播报未启用")
- normalized_text = (text or "").strip()
- if not normalized_text:
- raise ValueError("播报文本不能为空")
- if len(normalized_text) > self.config.synthesize.basic_text_limit_chars:
- raise ValueError(
- f"播报文本超过 {self.config.synthesize.basic_text_limit_chars} 字,请先分段后重试"
- )
- sdk = self._load_tencent_sdk()
- credentials = self._build_credentials(sdk["credential"])
- client = sdk["tts_client"].TtsClient(credentials, self.config.tencent.region)
- models = sdk["tts_models"]
- codec = self.config.synthesize.codec.lower()
- params = {
- "Text": normalized_text,
- "SessionId": session_id or f"speech-{int(time.time() * 1000)}",
- "ProjectId": 0,
- "ModelType": 1,
- "Volume": volume if volume is not None else self.config.synthesize.volume,
- "Speed": speed if speed is not None else self.config.synthesize.speed,
- "VoiceType": voice_type if voice_type is not None else self.config.synthesize.voice_type,
- "SampleRate": self.config.synthesize.sample_rate,
- "Codec": codec,
- "PrimaryLanguage": self.config.synthesize.primary_language,
- "EnableSubtitle": self.config.synthesize.enable_subtitle,
- }
- req = models.TextToVoiceRequest()
- req.from_json_string(json.dumps(params))
- try:
- resp = client.TextToVoice(req)
- audio_base64 = getattr(resp, "Audio", "") or ""
- request_id = getattr(resp, "RequestId", "")
- if not audio_base64:
- raise RuntimeError("腾讯云语音合成未返回音频内容")
- return {
- "audio_bytes": base64.b64decode(audio_base64),
- "content_type": CONTENT_TYPE_BY_CODEC.get(codec, "application/octet-stream"),
- "request_id": request_id,
- "raw": resp.to_json_string(),
- }
- except sdk["TencentCloudSDKException"] as exc:
- logger.error("[speech] Tencent TTS failed: %s", exc)
- raise RuntimeError(f"腾讯云语音合成失败: {exc}") from exc
# Module-level service instance, created once when this module is imported.
tencent_speech_service = TencentSpeechService()
|