import base64
import json
import time
from typing import Any

from utils.config import settings
from utils.logger import logger

# MIME type reported to callers for each (lower-cased) synthesis codec.
# Codecs not listed here fall back to "application/octet-stream".
CONTENT_TYPE_BY_CODEC = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "pcm": "audio/pcm",
}


class TencentSpeechService:
    """Speech-to-text and text-to-speech backed by the Tencent Cloud SDK.

    All behaviour is driven by ``settings.speech``. Configuration and
    caller-input problems are raised as ``ValueError``; a missing SDK or a
    failed/empty provider response is raised as ``RuntimeError``.
    """

    def __init__(self):
        self.config = settings.speech

    def _ensure_available(self):
        """Validate that the speech service is enabled and fully configured.

        Raises:
            ValueError: if the service is disabled, the provider is not
                ``tencent_cloud``, or any Tencent credential field is empty.
        """
        if not self.config.enabled:
            raise ValueError("语音服务未启用")
        if self.config.provider != "tencent_cloud":
            raise ValueError(f"暂不支持的语音服务商: {self.config.provider}")

        credentials = self.config.tencent
        if not credentials.app_id or not credentials.secret_id or not credentials.secret_key:
            raise ValueError("腾讯云语音服务凭证未配置")

    def _load_tencent_sdk(self):
        """Import the Tencent Cloud SDK lazily and return its pieces by name.

        The import happens here rather than at module import time so that
        this module can be loaded when the SDK is not installed.

        Returns:
            A dict with the keys ``credential``, ``TencentCloudSDKException``,
            ``asr_client``, ``asr_models``, ``tts_client`` and ``tts_models``.

        Raises:
            RuntimeError: if the SDK cannot be imported.
        """
        try:
            from tencentcloud.common import credential
            from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
                TencentCloudSDKException,
            )
            from tencentcloud.asr.v20190614 import asr_client, models as asr_models
            from tencentcloud.tts.v20190823 import tts_client, models as tts_models
        except ImportError as exc:
            raise RuntimeError(
                "未安装腾讯云 SDK,请先安装 tencentcloud-sdk-python"
            ) from exc

        return {
            "credential": credential,
            "TencentCloudSDKException": TencentCloudSDKException,
            "asr_client": asr_client,
            "asr_models": asr_models,
            "tts_client": tts_client,
            "tts_models": tts_models,
        }

    def _build_credentials(self, credential_module):
        """Build an SDK ``Credential`` from the configured secret id and key."""
        cfg = self.config.tencent
        return credential_module.Credential(cfg.secret_id, cfg.secret_key)

    @staticmethod
    def _read_upload_bytes(upload_file, max_bytes: int | None = None) -> bytes:
        """Read the payload of an uploaded file object.

        The stream is rewound before and after reading (when it supports
        ``seek``) so the caller's file object remains reusable.

        Args:
            upload_file: Object exposing the underlying stream as ``.file``.
            max_bytes: Optional size limit. When given, at most
                ``max_bytes + 1`` bytes are read, so an oversized upload is
                never fully buffered in memory; the caller detects the
                overflow by checking ``len(result) > max_bytes``. When
                ``None`` the whole stream is read.

        Returns:
            The bytes read from the stream.

        Raises:
            ValueError: if there is no underlying stream or it is empty.
        """
        source = getattr(upload_file, "file", None)
        if source is None:
            raise ValueError("未读取到音频文件")

        seekable = hasattr(source, "seek")
        if seekable:
            source.seek(0)

        if max_bytes is None:
            data = source.read()
        else:
            # Read one byte past the limit: enough to prove the upload is too
            # large without loading all of it. Loop because read(n) may
            # legitimately return fewer than n bytes before EOF.
            remaining = max(max_bytes, 0) + 1
            chunks = []
            while remaining > 0:
                chunk = source.read(remaining)
                if not chunk:
                    break
                chunks.append(chunk)
                remaining -= len(chunk)
            data = b"".join(chunks)

        if seekable:
            source.seek(0)
        if not data:
            raise ValueError("音频文件为空")
        return data

    def transcribe_file(self, upload_file, user_id: str = "", request_id: str | None = None) -> dict[str, Any]:
        """Transcribe an uploaded audio file with ``SentenceRecognition``.

        Args:
            upload_file: Upload object exposing the audio stream as ``.file``.
            user_id: Used to build the ``UsrAudioKey`` when ``request_id``
                is not supplied.
            request_id: Optional caller-chosen ``UsrAudioKey``.

        Returns:
            A dict with ``text`` (stripped transcript), ``request_id`` (the
            ``RequestId`` attribute of the provider response) and ``raw``
            (the response serialised by ``to_json_string()``).

        Raises:
            ValueError: if the service or transcription is disabled or
                misconfigured, or the audio is missing, empty or larger
                than the configured limit.
            RuntimeError: if the SDK is missing, the provider call fails,
                or the provider returns no text.
        """
        self._ensure_available()
        cfg = self.config.transcribe
        if not getattr(cfg, "enabled", True):
            raise ValueError("语音转文字未启用")

        # int() keeps the limit usable as a read size even if the configured
        # MB value is fractional; for integer lengths the comparison below is
        # unaffected by the truncation.
        max_size_bytes = int(cfg.max_audio_size_mb * 1024 * 1024)
        audio_bytes = self._read_upload_bytes(upload_file, max_bytes=max_size_bytes)
        if len(audio_bytes) > max_size_bytes:
            raise ValueError(
                f"音频文件超过 {cfg.max_audio_size_mb}MB,当前版本请缩短录音后重试"
            )

        sdk = self._load_tencent_sdk()
        credentials = self._build_credentials(sdk["credential"])
        client = sdk["asr_client"].AsrClient(credentials, self.config.tencent.region)
        models = sdk["asr_models"]

        usr_audio_key = request_id or f"{user_id or 'user'}-{int(time.time() * 1000)}"
        params = {
            "ProjectId": 0,
            "SubServiceType": 2,
            "EngSerViceType": cfg.engine_model_type,
            "SourceType": cfg.source_type,
            "VoiceFormat": cfg.voice_format,
            "UsrAudioKey": usr_audio_key,
            "Data": base64.b64encode(audio_bytes).decode("utf-8"),
            # Length of the raw audio, not of the base64 text.
            "DataLen": len(audio_bytes),
            "WordInfo": cfg.word_info,
            "FilterDirty": cfg.filter_dirty,
            "FilterModal": cfg.filter_modal,
            "FilterPunc": cfg.filter_punc,
            "ConvertNumMode": cfg.convert_num_mode,
        }
        # A configured hotword id takes precedence over an inline hotword list.
        if cfg.hotword_id:
            params["HotwordId"] = cfg.hotword_id
        elif cfg.hotword_list:
            params["HotwordList"] = cfg.hotword_list
        if cfg.customization_id:
            params["CustomizationId"] = cfg.customization_id
        if cfg.replace_text_id:
            params["ReplaceTextId"] = cfg.replace_text_id

        req = models.SentenceRecognitionRequest()
        req.from_json_string(json.dumps(params))

        # Only the remote call is guarded; response handling stays outside
        # the try so unrelated errors are not attributed to the SDK.
        try:
            resp = client.SentenceRecognition(req)
        except sdk["TencentCloudSDKException"] as exc:
            logger.error("[speech] Tencent ASR failed: %s", exc)
            raise RuntimeError(f"腾讯云语音识别失败: {exc}") from exc

        text = (getattr(resp, "Result", "") or "").strip()
        if not text:
            raise RuntimeError("腾讯云语音识别未返回文本内容")
        return {
            "text": text,
            "request_id": getattr(resp, "RequestId", ""),
            "raw": resp.to_json_string(),
        }

    def synthesize_text(
        self,
        text: str,
        *,
        session_id: str | None = None,
        voice_type: int | None = None,
        speed: float | None = None,
        volume: float | None = None,
    ) -> dict[str, Any]:
        """Synthesize speech for ``text`` with ``TextToVoice``.

        Args:
            text: Text to speak; surrounding whitespace is stripped.
            session_id: Optional ``SessionId``; a timestamp-based id is
                generated when omitted.
            voice_type: Overrides the configured voice type when not ``None``.
            speed: Overrides the configured speed when not ``None``.
            volume: Overrides the configured volume when not ``None``.

        Returns:
            A dict with ``audio_bytes`` (base64-decoded audio),
            ``content_type`` (looked up in ``CONTENT_TYPE_BY_CODEC``),
            ``request_id`` and ``raw`` (the response serialised by
            ``to_json_string()``).

        Raises:
            ValueError: if the service or synthesis is disabled or
                misconfigured, or the text is empty or longer than the
                configured character limit.
            RuntimeError: if the SDK is missing, the provider call fails,
                or the provider returns no audio.
        """
        self._ensure_available()
        cfg = self.config.synthesize
        if not getattr(cfg, "enabled", True):
            raise ValueError("语音播报未启用")

        normalized_text = (text or "").strip()
        if not normalized_text:
            raise ValueError("播报文本不能为空")
        if len(normalized_text) > cfg.basic_text_limit_chars:
            raise ValueError(
                f"播报文本超过 {cfg.basic_text_limit_chars} 字,请先分段后重试"
            )

        sdk = self._load_tencent_sdk()
        credentials = self._build_credentials(sdk["credential"])
        client = sdk["tts_client"].TtsClient(credentials, self.config.tencent.region)
        models = sdk["tts_models"]

        codec = cfg.codec.lower()
        params = {
            "Text": normalized_text,
            "SessionId": session_id or f"speech-{int(time.time() * 1000)}",
            "ProjectId": 0,
            "ModelType": 1,
            # Explicit `is not None` checks so a caller-supplied 0 is honoured.
            "Volume": volume if volume is not None else cfg.volume,
            "Speed": speed if speed is not None else cfg.speed,
            "VoiceType": voice_type if voice_type is not None else cfg.voice_type,
            "SampleRate": cfg.sample_rate,
            "Codec": codec,
            "PrimaryLanguage": cfg.primary_language,
            "EnableSubtitle": cfg.enable_subtitle,
        }

        req = models.TextToVoiceRequest()
        req.from_json_string(json.dumps(params))

        # Only the remote call is guarded; response handling stays outside
        # the try so unrelated errors are not attributed to the SDK.
        try:
            resp = client.TextToVoice(req)
        except sdk["TencentCloudSDKException"] as exc:
            logger.error("[speech] Tencent TTS failed: %s", exc)
            raise RuntimeError(f"腾讯云语音合成失败: {exc}") from exc

        audio_base64 = getattr(resp, "Audio", "") or ""
        if not audio_base64:
            raise RuntimeError("腾讯云语音合成未返回音频内容")
        return {
            "audio_bytes": base64.b64decode(audio_base64),
            "content_type": CONTENT_TYPE_BY_CODEC.get(codec, "application/octet-stream"),
            "request_id": getattr(resp, "RequestId", ""),
            "raw": resp.to_json_string(),
        }


# Shared module-level instance for importers.
tencent_speech_service = TencentSpeechService()