SD-SafeAI
/
shudao-main


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
							import base64
import json
import time
from typing import Any

from utils.config import settings
from utils.logger import logger


# Lookup table from a lowercase synthesis codec name to the content type
# reported with the synthesized audio. Codecs that are not listed here are
# handled by the caller's own fallback value.
CONTENT_TYPE_BY_CODEC: dict[str, str] = dict(
    mp3="audio/mpeg",
    wav="audio/wav",
    pcm="audio/pcm",
)


class TencentSpeechService:
    def __init__(self):
        self.config = settings.speech

    def _ensure_available(self):
        if not self.config.enabled:
            raise ValueError("语音服务未启用")
        if self.config.provider != "tencent_cloud":
            raise ValueError(f"暂不支持的语音服务商: {self.config.provider}")

        credentials = self.config.tencent
        if not credentials.app_id or not credentials.secret_id or not credentials.secret_key:
            raise ValueError("腾讯云语音服务凭证未配置")

    def _load_tencent_sdk(self):
        try:
            from tencentcloud.common import credential
            from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
                TencentCloudSDKException,
            )
            from tencentcloud.asr.v20190614 import asr_client, models as asr_models
            from tencentcloud.tts.v20190823 import tts_client, models as tts_models
        except ImportError as exc:
            raise RuntimeError(
                "未安装腾讯云 SDK，请先安装 tencentcloud-sdk-python"
            ) from exc

        return {
            "credential": credential,
            "TencentCloudSDKException": TencentCloudSDKException,
            "asr_client": asr_client,
            "asr_models": asr_models,
            "tts_client": tts_client,
            "tts_models": tts_models,
        }

    def _build_credentials(self, credential_module):
        cfg = self.config.tencent
        return credential_module.Credential(cfg.secret_id, cfg.secret_key)

    @staticmethod
    def _read_upload_bytes(upload_file) -> bytes:
        source = getattr(upload_file, "file", None)
        if source is None:
            raise ValueError("未读取到音频文件")

        if hasattr(source, "seek"):
            source.seek(0)

        data = source.read()
        if hasattr(source, "seek"):
            source.seek(0)

        if not data:
            raise ValueError("音频文件为空")

        return data

    def transcribe_file(self, upload_file, user_id: str = "", request_id: str | None = None) -> dict[str, Any]:
        self._ensure_available()

        if not getattr(self.config.transcribe, "enabled", True):
            raise ValueError("语音转文字未启用")

        audio_bytes = self._read_upload_bytes(upload_file)
        max_size_bytes = self.config.transcribe.max_audio_size_mb * 1024 * 1024
        if len(audio_bytes) > max_size_bytes:
            raise ValueError(
                f"音频文件超过 {self.config.transcribe.max_audio_size_mb}MB，当前版本请缩短录音后重试"
            )

        sdk = self._load_tencent_sdk()
        credentials = self._build_credentials(sdk["credential"])
        client = sdk["asr_client"].AsrClient(credentials, self.config.tencent.region)
        models = sdk["asr_models"]

        usr_audio_key = request_id or f"{user_id or 'user'}-{int(time.time() * 1000)}"
        params = {
            "ProjectId": 0,
            "SubServiceType": 2,
            "EngSerViceType": self.config.transcribe.engine_model_type,
            "SourceType": self.config.transcribe.source_type,
            "VoiceFormat": self.config.transcribe.voice_format,
            "UsrAudioKey": usr_audio_key,
            "Data": base64.b64encode(audio_bytes).decode("utf-8"),
            "DataLen": len(audio_bytes),
            "WordInfo": self.config.transcribe.word_info,
            "FilterDirty": self.config.transcribe.filter_dirty,
            "FilterModal": self.config.transcribe.filter_modal,
            "FilterPunc": self.config.transcribe.filter_punc,
            "ConvertNumMode": self.config.transcribe.convert_num_mode,
        }

        if self.config.transcribe.hotword_id:
            params["HotwordId"] = self.config.transcribe.hotword_id
        elif self.config.transcribe.hotword_list:
            params["HotwordList"] = self.config.transcribe.hotword_list

        if self.config.transcribe.customization_id:
            params["CustomizationId"] = self.config.transcribe.customization_id

        if self.config.transcribe.replace_text_id:
            params["ReplaceTextId"] = self.config.transcribe.replace_text_id

        req = models.SentenceRecognitionRequest()
        req.from_json_string(json.dumps(params))

        try:
            resp = client.SentenceRecognition(req)
            text = getattr(resp, "Result", "") or ""
            request_id = getattr(resp, "RequestId", "")
            if not text.strip():
                raise RuntimeError("腾讯云语音识别未返回文本内容")

            return {
                "text": text.strip(),
                "request_id": request_id,
                "raw": resp.to_json_string(),
            }
        except sdk["TencentCloudSDKException"] as exc:
            logger.error("[speech] Tencent ASR failed: %s", exc)
            raise RuntimeError(f"腾讯云语音识别失败: {exc}") from exc

    def synthesize_text(
        self,
        text: str,
        *,
        session_id: str | None = None,
        voice_type: int | None = None,
        speed: float | None = None,
        volume: float | None = None,
    ) -> dict[str, Any]:
        self._ensure_available()

        if not getattr(self.config.synthesize, "enabled", True):
            raise ValueError("语音播报未启用")

        normalized_text = (text or "").strip()
        if not normalized_text:
            raise ValueError("播报文本不能为空")

        if len(normalized_text) > self.config.synthesize.basic_text_limit_chars:
            raise ValueError(
                f"播报文本超过 {self.config.synthesize.basic_text_limit_chars} 字，请先分段后重试"
            )

        sdk = self._load_tencent_sdk()
        credentials = self._build_credentials(sdk["credential"])
        client = sdk["tts_client"].TtsClient(credentials, self.config.tencent.region)
        models = sdk["tts_models"]

        codec = self.config.synthesize.codec.lower()
        params = {
            "Text": normalized_text,
            "SessionId": session_id or f"speech-{int(time.time() * 1000)}",
            "ProjectId": 0,
            "ModelType": 1,
            "Volume": volume if volume is not None else self.config.synthesize.volume,
            "Speed": speed if speed is not None else self.config.synthesize.speed,
            "VoiceType": voice_type if voice_type is not None else self.config.synthesize.voice_type,
            "SampleRate": self.config.synthesize.sample_rate,
            "Codec": codec,
            "PrimaryLanguage": self.config.synthesize.primary_language,
            "EnableSubtitle": self.config.synthesize.enable_subtitle,
        }

        req = models.TextToVoiceRequest()
        req.from_json_string(json.dumps(params))

        try:
            resp = client.TextToVoice(req)
            audio_base64 = getattr(resp, "Audio", "") or ""
            request_id = getattr(resp, "RequestId", "")
            if not audio_base64:
                raise RuntimeError("腾讯云语音合成未返回音频内容")

            return {
                "audio_bytes": base64.b64decode(audio_base64),
                "content_type": CONTENT_TYPE_BY_CODEC.get(codec, "application/octet-stream"),
                "request_id": request_id,
                "raw": resp.to_json_string(),
            }
        except sdk["TencentCloudSDKException"] as exc:
            logger.error("[speech] Tencent TTS failed: %s", exc)
            raise RuntimeError(f"腾讯云语音合成失败: {exc}") from exc


# Module-level service instance created at import time; its constructor only
# stores a reference to settings.speech.
tencent_speech_service: TencentSpeechService = TencentSpeechService()