"""Data transfer objects (DTOs) for the AI voice features.

Defines the request and response schemas of the speech synthesis (TTS),
speech recognition (ASR), voice cloning and voice management APIs.

Requirements: 6.1-6.13, 8.1
"""
import re
from typing import List, Optional, Literal, Union

from pydantic import BaseModel, Field, field_validator

# Characters allowed in a cloned-voice name prefix. Always used with
# ``fullmatch`` so that the whole value has to consist of these characters.
_PREFIX_PATTERN = re.compile(r'[a-zA-Z0-9_]+')


# ==================== TTS schemas ====================

class TTSRequest(BaseModel):
    """Speech synthesis (TTS) request."""

    model: str = Field(..., description="语音合成模型:cosyvoice-v3-flash、cosyvoice-v3-plus、cosyvoice-v2")
    voice: str = Field(..., description="音色ID,系统音色或复刻音色")
    # NOTE(review): the description states a 2000-character limit, but no
    # max_length is enforced on this field here.
    text: str = Field(..., description="待合成文本,单次不超过2000字符")
    stream: bool = Field(default=False, description="是否使用流式输出")
    format: str = Field(default="mp3", description="音频格式:mp3、wav、pcm、opus")
    sample_rate: int = Field(default=22050, description="采样率:8000、16000、22050、24000、44100、48000")
    volume: int = Field(default=50, ge=0, le=100, description="音量,范围 [0, 100]")
    speech_rate: float = Field(default=1.0, ge=0.5, le=2.0, description="语速,范围 [0.5, 2.0]")
    pitch_rate: float = Field(default=1.0, ge=0.5, le=2.0, description="音高,范围 [0.5, 2.0]")
    instruction: Optional[str] = Field(default=None, description="指令设置(情感、场景等),仅部分音色支持")

    @field_validator('text')
    @classmethod
    def validate_text(cls, v):
        """Reject empty or whitespace-only text; return the value unchanged.

        Raises:
            ValueError: If ``v`` is empty or contains only whitespace.
        """
        if not v or not v.strip():
            raise ValueError('文本不能为空')
        return v

    @field_validator('format')
    @classmethod
    def validate_format(cls, v):
        """Accept only the audio formats in the allow-list below.

        Raises:
            ValueError: If ``v`` is not one of the allowed formats.
        """
        allowed_formats = ['mp3', 'wav', 'pcm', 'opus']
        if v not in allowed_formats:
            raise ValueError(f'不支持的音频格式,允许的格式: {allowed_formats}')
        return v

    @field_validator('sample_rate')
    @classmethod
    def validate_sample_rate(cls, v):
        """Accept only the sample rates in the allow-list below.

        Raises:
            ValueError: If ``v`` is not one of the allowed sample rates.
        """
        allowed_rates = [8000, 16000, 22050, 24000, 44100, 48000]
        if v not in allowed_rates:
            raise ValueError(f'不支持的采样率,允许的采样率: {allowed_rates}')
        return v


class TTSResponse(BaseModel):
    """Speech synthesis response (non-streaming)."""

    audio_url: str = Field(..., description="OSS上的音频文件URL")
    duration: float = Field(..., description="音频时长(秒)")
    format: str = Field(..., description="音频格式")
    sample_rate: int = Field(..., description="采样率")
    characters: int = Field(..., description="合成的字符数")


class LongTTSResponse(BaseModel):
    """Long-text speech synthesis response."""

    audio_url: str = Field(..., description="OSS上的音频文件URL")
    duration: float = Field(..., description="音频总时长(秒)")
    format: str = Field(..., description="音频格式")
    total_characters: int = Field(..., description="总字符数")
    segments: int = Field(..., description="切割的片段数")


class TTSModelResponse(BaseModel):
    """TTS model information response."""

    id: int = Field(..., description="模型ID")
    title: str = Field(..., description="模型标识")
    name: str = Field(..., description="模型名称")
    description: str = Field(..., description="模型描述")
    price: str = Field(..., description="价格信息")
    features: List[str] = Field(default=[], description="功能特性列表")


# ==================== ASR schemas ====================

class ASRRequest(BaseModel):
    """Synchronous speech recognition request."""

    model: str = Field(..., description="识别模型:qwen3-asr-flash、qwen-audio-asr")
    audio_url: Optional[str] = Field(default=None, description="音频文件URL(与audio_base64二选一)")
    audio_base64: Optional[str] = Field(default=None, description="Base64编码的音频数据(与audio_url二选一)")
    language: Optional[str] = Field(default=None, description="指定语种:zh、en、ja、ko等,不指定则自动检测")
    enable_itn: bool = Field(default=False, description="是否启用逆文本标准化(仅中英文)")
    context: Optional[str] = Field(default=None, description="上下文提示,提升特定场景识别准确率")

    @field_validator('audio_url', 'audio_base64')
    @classmethod
    def validate_audio_source(cls, v, info):
        """Pass the audio source value through unchanged."""
        # The original comment says the check is handled at model level.
        # NOTE(review): no model-level validator is defined in this class, so
        # a request with neither audio_url nor audio_base64 is accepted here;
        # confirm the "one of the two" rule is enforced by the caller.
        return v


class ASRUsage(BaseModel):
    """ASR usage statistics."""

    input_tokens: int = Field(default=0, description="输入Token数")
    output_tokens: int = Field(default=0, description="输出Token数")
    seconds: int = Field(default=0, description="音频时长(秒)")


class ASRResponse(BaseModel):
    """Synchronous speech recognition response."""

    text: str = Field(..., description="识别文本")
    language: str = Field(..., description="检测到的语言")
    emotion: Optional[str] = Field(default=None, description="情感类型")
    duration: int = Field(..., description="音频时长(秒)")
    usage: ASRUsage = Field(..., description="使用统计")


class TranscribeRequest(BaseModel):
    """Asynchronous transcription request."""

    model: str = Field(..., description="识别模型:qwen3-asr-flash-filetrans")
    file_url: str = Field(..., description="音频文件URL,必须公网可访问")
    language: Optional[str] = Field(default=None, description="指定语种")
    enable_itn: bool = Field(default=False, description="是否启用ITN")
    context: Optional[str] = Field(default=None, description="上下文提示")
    channel_id: List[int] = Field(default=[0], description="多音轨文件的音轨索引")


class TranscriptSentence(BaseModel):
    """A single transcribed sentence."""

    begin_time: int = Field(..., description="开始时间(毫秒)")
    end_time: int = Field(..., description="结束时间(毫秒)")
    text: str = Field(..., description="句子文本")
    sentence_id: int = Field(..., description="句子ID")
    language: Optional[str] = Field(default=None, description="语言")
    emotion: Optional[str] = Field(default=None, description="情感")


class TranscriptChannel(BaseModel):
    """Transcription result of one audio channel."""

    channel_id: int = Field(..., description="音轨ID")
    text: str = Field(..., description="完整文本")
    sentences: List[TranscriptSentence] = Field(default=[], description="句子列表")


class TranscribeResult(BaseModel):
    """Transcription result."""

    transcription_url: Optional[str] = Field(default=None, description="结果文件URL")
    transcripts: List[TranscriptChannel] = Field(default=[], description="转写结果列表")


class TaskUsage(BaseModel):
    """Usage statistics of an asynchronous task."""

    seconds: int = Field(default=0, description="音频时长(秒)")


class TaskResponse(BaseModel):
    """Asynchronous task response."""

    task_id: str = Field(..., description="任务ID")
    task_status: str = Field(..., description="任务状态:PENDING、RUNNING、SUCCEEDED、FAILED、UNKNOWN")
    submit_time: Optional[str] = Field(default=None, description="提交时间")
    scheduled_time: Optional[str] = Field(default=None, description="调度时间")
    end_time: Optional[str] = Field(default=None, description="完成时间")
    result: Optional[TranscribeResult] = Field(default=None, description="转写结果")
    usage: Optional[TaskUsage] = Field(default=None, description="使用统计")
    error_message: Optional[str] = Field(default=None, description="失败时的错误信息")


class ASRModelResponse(BaseModel):
    """ASR model information response."""

    id: int = Field(..., description="模型ID")
    title: str = Field(..., description="模型标识")
    name: str = Field(..., description="模型名称")
    description: str = Field(..., description="模型描述")
    call_type: str = Field(..., description="调用方式:sync、async")
    features: List[str] = Field(default=[], description="功能特性列表")


# ==================== Voice cloning schemas ====================

class VoiceCreateRequest(BaseModel):
    """Request to create a cloned voice."""

    target_model: str = Field(..., description="驱动音色的语音合成模型:cosyvoice-v3-plus、cosyvoice-v3-flash、cosyvoice-v2")
    prefix: str = Field(..., max_length=10, description="音色名称前缀,仅允许数字、字母和下划线,不超过10字符")
    voice_name: Optional[str] = Field(default=None, max_length=50, description="音色名称(用户输入的中文名称)")
    audio_url: Optional[str] = Field(default=None, description="音频文件URL(与file二选一)")
    language_hints: Optional[List[str]] = Field(default=None, description="语言提示:en、fr、de、ja、ko、ru")

    @field_validator('prefix')
    @classmethod
    def validate_prefix(cls, v):
        """Require the prefix to consist only of ASCII letters, digits and underscores.

        Raises:
            ValueError: If ``v`` is empty or contains any other character.
        """
        # fullmatch rather than match(r'^...$'): ``$`` also matches right
        # before a trailing newline, which let values such as "abc\n" through.
        if not _PREFIX_PATTERN.fullmatch(v):
            raise ValueError('前缀仅允许数字、字母和下划线')
        return v

    @field_validator('target_model')
    @classmethod
    def validate_target_model(cls, v):
        """Accept only the target models in the allow-list below.

        Raises:
            ValueError: If ``v`` is not one of the allowed models.
        """
        # NOTE(review): 'cosyvoice-clone-v1' is accepted here although the
        # field description does not list it.
        allowed_models = ['cosyvoice-v3-plus', 'cosyvoice-v3-flash', 'cosyvoice-v2', 'cosyvoice-clone-v1']
        if v not in allowed_models:
            raise ValueError(f'不支持的目标模型,允许的模型: {allowed_models}')
        return v


class VoiceUpdateRequest(BaseModel):
    """Request to update a voice."""

    audio_url: Optional[str] = Field(default=None, description="新的音频文件URL(与file二选一)")


class VoiceResponse(BaseModel):
    """Voice information response."""

    voice_id: str = Field(..., description="音色ID")
    status: str = Field(..., description="音色状态:DEPLOYING、OK、UNDEPLOYED")
    target_model: Optional[str] = Field(default=None, description="目标模型")
    voice_name: Optional[str] = Field(default=None, description="音色名称(用户输入的中文名称)")
    resource_link: Optional[str] = Field(default=None, description="音频资源链接")
    gmt_create: Optional[str] = Field(default=None, description="创建时间")
    gmt_modified: Optional[str] = Field(default=None, description="修改时间")


class VoiceListResponse(BaseModel):
    """Voice list response."""

    total: int = Field(..., description="总数")
    voices: List[VoiceResponse] = Field(default=[], description="音色列表")


# ==================== System voice schemas ====================

class SystemVoiceFeatures(BaseModel):
    """Feature flags of a system voice."""

    ssml: bool = Field(default=False, description="是否支持SSML")
    instruct: bool = Field(default=False, description="是否支持Instruct")
    timestamp: bool = Field(default=False, description="是否支持时间戳")


class SystemVoiceResponse(BaseModel):
    """System voice response."""

    voice_id: str = Field(..., description="音色ID")
    name: str = Field(..., description="音色名称")
    trait: str = Field(..., description="音色特质")
    age: str = Field(..., description="年龄段")
    category: str = Field(..., description="场景分类")
    languages: List[str] = Field(default=[], description="支持的语言列表")
    models: List[str] = Field(default=[], description="支持的模型列表")
    features: SystemVoiceFeatures = Field(..., description="功能特性")


class SystemVoiceListRequest(BaseModel):
    """Query parameters for listing system voices."""

    model: Optional[str] = Field(default=None, description="按模型筛选")
    category: Optional[str] = Field(default=None, description="按场景分类筛选")


# ==================== Creation history schemas ====================

class AudioHistoryItem(BaseModel):
    """A single creation-history record."""

    id: Union[int, str] = Field(..., description="记录ID(可能是数字或带前缀的字符串如 'sync-1')")
    name: str = Field(..., description="展示名称(文件名或文本摘要)")
    custom_name: Optional[str] = Field(default=None, description="用户自定义名称")
    mode: str = Field(..., description="创作模式,例如:声音合成、语音识别等")
    duration: Optional[float] = Field(default=None, description="音频时长(秒),仅对音频类记录有效")
    characters: Optional[int] = Field(default=None, description="文本字符数,仅对文本类记录有效")
    status: str = Field(..., description="状态:已完成、生成中等")
    audio_url: Optional[str] = Field(default=None, description="音频文件URL(如有)")
    created_at: str = Field(..., description="创建时间(ISO格式)")
    completed_at: Optional[str] = Field(default=None, description="完成时间(ISO格式)")
    # Recognition-history fields (optional)
    recognition_text: Optional[str] = Field(default=None, description="识别文本内容(仅识别历史)")
    language: Optional[str] = Field(default=None, description="识别出的语言(仅识别历史)")


class AudioHistoryListResponse(BaseModel):
    """Creation-history list response."""

    total: int = Field(..., description="总记录数")
    items: List[AudioHistoryItem] = Field(default=[], description="记录列表")


class UpdateAudioNameRequest(BaseModel):
    """Request to update the custom name of an audio record."""

    custom_name: str = Field(..., max_length=200, description="自定义名称")