| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275 |
- """
- AI语音相关的数据传输对象定义
- 定义语音合成、语音识别、声音复刻和音色管理API的请求和响应数据结构
- 需求: 6.1-6.13, 8.1
- """
from typing import List, Optional, Literal, Union

from pydantic import BaseModel, Field, field_validator, model_validator
- # ==================== TTS相关Schema ====================
# Request body for a text-to-speech synthesis call.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TTSRequest(BaseModel):
    """语音合成请求"""

    model: str = Field(
        ..., description="语音合成模型:cosyvoice-v3-flash、cosyvoice-v3-plus、cosyvoice-v2"
    )
    voice: str = Field(..., description="音色ID,系统音色或复刻音色")
    text: str = Field(..., description="待合成文本,单次不超过2000字符")
    stream: bool = Field(False, description="是否使用流式输出")
    format: str = Field("mp3", description="音频格式:mp3、wav、pcm、opus")
    sample_rate: int = Field(
        22050, description="采样率:8000、16000、22050、24000、44100、48000"
    )
    # Numeric ranges below are enforced by the ge/le constraints.
    volume: int = Field(50, ge=0, le=100, description="音量,范围 [0, 100]")
    speech_rate: float = Field(1.0, ge=0.5, le=2.0, description="语速,范围 [0.5, 2.0]")
    pitch_rate: float = Field(1.0, ge=0.5, le=2.0, description="音高,范围 [0.5, 2.0]")
    instruction: Optional[str] = Field(
        None, description="指令设置(情感、场景等),仅部分音色支持"
    )

    @field_validator('text')
    @classmethod
    def validate_text(cls, v):
        # Accept only text with at least one non-whitespace character; the
        # value is returned as received (not stripped).
        if v and v.strip():
            return v
        raise ValueError('文本不能为空')

    @field_validator('format')
    @classmethod
    def validate_format(cls, v):
        # Kept as a list so its repr in the error message stays the same.
        supported = ['mp3', 'wav', 'pcm', 'opus']
        if v in supported:
            return v
        raise ValueError(f'不支持的音频格式,允许的格式: {supported}')

    @field_validator('sample_rate')
    @classmethod
    def validate_sample_rate(cls, v):
        # Kept as a list so its repr in the error message stays the same.
        supported = [8000, 16000, 22050, 24000, 44100, 48000]
        if v in supported:
            return v
        raise ValueError(f'不支持的采样率,允许的采样率: {supported}')
# Response payload for non-streaming speech synthesis.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TTSResponse(BaseModel):
    """语音合成响应(非流式)"""

    # All fields are required.
    audio_url: str = Field(
        ...,
        description="OSS上的音频文件URL",
    )
    duration: float = Field(
        ...,
        description="音频时长(秒)",
    )
    format: str = Field(
        ...,
        description="音频格式",
    )
    sample_rate: int = Field(
        ...,
        description="采样率",
    )
    characters: int = Field(
        ...,
        description="合成的字符数",
    )
# Response payload for long-text speech synthesis.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class LongTTSResponse(BaseModel):
    """长文本语音合成响应"""

    # All fields are required.
    audio_url: str = Field(
        ...,
        description="OSS上的音频文件URL",
    )
    duration: float = Field(
        ...,
        description="音频总时长(秒)",
    )
    format: str = Field(
        ...,
        description="音频格式",
    )
    total_characters: int = Field(
        ...,
        description="总字符数",
    )
    segments: int = Field(
        ...,
        description="切割的片段数",
    )
# Descriptor of one TTS model as returned to clients.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TTSModelResponse(BaseModel):
    """TTS模型信息响应"""

    id: int = Field(
        ...,
        description="模型ID",
    )
    title: str = Field(
        ...,
        description="模型标识",
    )
    name: str = Field(
        ...,
        description="模型名称",
    )
    description: str = Field(
        ...,
        description="模型描述",
    )
    price: str = Field(
        ...,
        description="价格信息",
    )
    # Optional; defaults to an empty list.
    features: List[str] = Field(
        [],
        description="功能特性列表",
    )
- # ==================== ASR相关Schema ====================
# Request body for synchronous speech recognition.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class ASRRequest(BaseModel):
    """同步语音识别请求"""

    model: str = Field(..., description="识别模型:qwen3-asr-flash、qwen-audio-asr")
    # The two audio sources are alternatives; see validate_audio_source.
    audio_url: Optional[str] = Field(
        None, description="音频文件URL(与audio_base64二选一)"
    )
    audio_base64: Optional[str] = Field(
        None, description="Base64编码的音频数据(与audio_url二选一)"
    )
    language: Optional[str] = Field(
        None, description="指定语种:zh、en、ja、ko等,不指定则自动检测"
    )
    enable_itn: bool = Field(False, description="是否启用逆文本标准化(仅中英文)")
    context: Optional[str] = Field(
        None, description="上下文提示,提升特定场景识别准确率"
    )

    @model_validator(mode='after')
    def validate_audio_source(self):
        """Require at least one audio source on the request.

        The previous per-field validator returned its input unchanged, so a
        request carrying neither ``audio_url`` nor ``audio_base64`` passed
        validation. ``None`` and the empty string both count as "missing".
        Requests that supply both sources are still accepted so existing
        callers keep working.

        Raises:
            ValueError: if both ``audio_url`` and ``audio_base64`` are empty.
        """
        if not self.audio_url and not self.audio_base64:
            raise ValueError('audio_url 和 audio_base64 必须至少提供一个')
        return self
# Usage counters attached to a speech-recognition result.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class ASRUsage(BaseModel):
    """ASR使用统计"""

    # Every counter defaults to 0.
    input_tokens: int = Field(
        0,
        description="输入Token数",
    )
    output_tokens: int = Field(
        0,
        description="输出Token数",
    )
    seconds: int = Field(
        0,
        description="音频时长(秒)",
    )
# Response payload for synchronous speech recognition.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class ASRResponse(BaseModel):
    """同步语音识别响应"""

    text: str = Field(
        ...,
        description="识别文本",
    )
    language: str = Field(
        ...,
        description="检测到的语言",
    )
    # Only optional field of this response.
    emotion: Optional[str] = Field(
        None,
        description="情感类型",
    )
    duration: int = Field(
        ...,
        description="音频时长(秒)",
    )
    usage: ASRUsage = Field(
        ...,
        description="使用统计",
    )
# Request body for submitting an asynchronous transcription job.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TranscribeRequest(BaseModel):
    """异步转写请求"""

    model: str = Field(
        ...,
        description="识别模型:qwen3-asr-flash-filetrans",
    )
    file_url: str = Field(
        ...,
        description="音频文件URL,必须公网可访问",
    )
    language: Optional[str] = Field(
        None,
        description="指定语种",
    )
    enable_itn: bool = Field(
        False,
        description="是否启用ITN",
    )
    context: Optional[str] = Field(
        None,
        description="上下文提示",
    )
    # Defaults to the single track index 0.
    channel_id: List[int] = Field(
        [0],
        description="多音轨文件的音轨索引",
    )
# One sentence of a transcription, with its time span.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TranscriptSentence(BaseModel):
    """转写句子"""

    begin_time: int = Field(
        ...,
        description="开始时间(毫秒)",
    )
    end_time: int = Field(
        ...,
        description="结束时间(毫秒)",
    )
    text: str = Field(
        ...,
        description="句子文本",
    )
    sentence_id: int = Field(
        ...,
        description="句子ID",
    )
    # The remaining two fields are optional and default to None.
    language: Optional[str] = Field(
        None,
        description="语言",
    )
    emotion: Optional[str] = Field(
        None,
        description="情感",
    )
# Transcription result for a single audio track.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TranscriptChannel(BaseModel):
    """转写音轨结果"""

    channel_id: int = Field(
        ...,
        description="音轨ID",
    )
    text: str = Field(
        ...,
        description="完整文本",
    )
    # Optional; defaults to an empty list.
    sentences: List[TranscriptSentence] = Field(
        [],
        description="句子列表",
    )
# Outcome of a transcription job: a result-file URL and/or per-track results.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TranscribeResult(BaseModel):
    """转写结果"""

    # Both fields are optional.
    transcription_url: Optional[str] = Field(
        None,
        description="结果文件URL",
    )
    transcripts: List[TranscriptChannel] = Field(
        [],
        description="转写结果列表",
    )
# Usage counter attached to an asynchronous task.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TaskUsage(BaseModel):
    """任务使用统计"""

    # Defaults to 0 when not reported.
    seconds: int = Field(
        0,
        description="音频时长(秒)",
    )
# Status report for an asynchronous task.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class TaskResponse(BaseModel):
    """异步任务响应"""

    # Only the task id and status are required.
    task_id: str = Field(
        ...,
        description="任务ID",
    )
    task_status: str = Field(
        ...,
        description="任务状态:PENDING、RUNNING、SUCCEEDED、FAILED、UNKNOWN",
    )
    # Timestamps are carried as strings; all default to None.
    submit_time: Optional[str] = Field(
        None,
        description="提交时间",
    )
    scheduled_time: Optional[str] = Field(
        None,
        description="调度时间",
    )
    end_time: Optional[str] = Field(
        None,
        description="完成时间",
    )
    result: Optional[TranscribeResult] = Field(
        None,
        description="转写结果",
    )
    usage: Optional[TaskUsage] = Field(
        None,
        description="使用统计",
    )
    error_message: Optional[str] = Field(
        None,
        description="失败时的错误信息",
    )
# Descriptor of one ASR model as returned to clients.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class ASRModelResponse(BaseModel):
    """ASR模型信息响应"""

    id: int = Field(
        ...,
        description="模型ID",
    )
    title: str = Field(
        ...,
        description="模型标识",
    )
    name: str = Field(
        ...,
        description="模型名称",
    )
    description: str = Field(
        ...,
        description="模型描述",
    )
    call_type: str = Field(
        ...,
        description="调用方式:sync、async",
    )
    # Optional; defaults to an empty list.
    features: List[str] = Field(
        [],
        description="功能特性列表",
    )
- # ==================== 声音复刻相关Schema ====================
# Request body for creating a cloned voice.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class VoiceCreateRequest(BaseModel):
    """创建复刻音色请求"""

    target_model: str = Field(
        ...,
        description="驱动音色的语音合成模型:cosyvoice-v3-plus、cosyvoice-v3-flash、cosyvoice-v2",
    )
    prefix: str = Field(
        ...,
        max_length=10,
        description="音色名称前缀,仅允许数字、字母和下划线,不超过10字符",
    )
    voice_name: Optional[str] = Field(
        None,
        max_length=50,
        description="音色名称(用户输入的中文名称)",
    )
    audio_url: Optional[str] = Field(
        None,
        description="音频文件URL(与file二选一)",
    )
    language_hints: Optional[List[str]] = Field(
        None,
        description="语言提示:en、fr、de、ja、ko、ru",
    )

    @field_validator('prefix')
    @classmethod
    def validate_prefix(cls, v):
        """Allow only ASCII letters, digits and underscores in ``prefix``.

        Raises:
            ValueError: if the prefix is empty or contains any other character.
        """
        import re

        # fullmatch rather than match with '^...$': '$' also matches just
        # before a trailing newline, so a value like "abc\n" used to pass.
        if not re.fullmatch(r'[a-zA-Z0-9_]+', v):
            raise ValueError('前缀仅允许数字、字母和下划线')
        return v

    @field_validator('target_model')
    @classmethod
    def validate_target_model(cls, v):
        """Restrict ``target_model`` to the known model identifiers.

        Raises:
            ValueError: if the value is not one of the allowed models.
        """
        # Kept as a list so its repr in the error message stays the same.
        allowed_models = [
            'cosyvoice-v3-plus',
            'cosyvoice-v3-flash',
            'cosyvoice-v2',
            'cosyvoice-clone-v1',
        ]
        if v not in allowed_models:
            raise ValueError(f'不支持的目标模型,允许的模型: {allowed_models}')
        return v
# Request body for updating an existing voice.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class VoiceUpdateRequest(BaseModel):
    """更新音色请求"""

    # Optional; defaults to None.
    audio_url: Optional[str] = Field(
        None,
        description="新的音频文件URL(与file二选一)",
    )
# Details of a single voice as returned to clients.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class VoiceResponse(BaseModel):
    """音色信息响应"""

    # Only the id and status are required; everything else defaults to None.
    voice_id: str = Field(
        ...,
        description="音色ID",
    )
    status: str = Field(
        ...,
        description="音色状态:DEPLOYING、OK、UNDEPLOYED",
    )
    target_model: Optional[str] = Field(
        None,
        description="目标模型",
    )
    voice_name: Optional[str] = Field(
        None,
        description="音色名称(用户输入的中文名称)",
    )
    resource_link: Optional[str] = Field(
        None,
        description="音频资源链接",
    )
    gmt_create: Optional[str] = Field(
        None,
        description="创建时间",
    )
    gmt_modified: Optional[str] = Field(
        None,
        description="修改时间",
    )
# Voice list together with a total count.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class VoiceListResponse(BaseModel):
    """音色列表响应"""

    total: int = Field(
        ...,
        description="总数",
    )
    # Optional; defaults to an empty list.
    voices: List[VoiceResponse] = Field(
        [],
        description="音色列表",
    )
- # ==================== 系统音色相关Schema ====================
# Capability flags of a system voice.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class SystemVoiceFeatures(BaseModel):
    """系统音色功能特性"""

    # Every flag defaults to False.
    ssml: bool = Field(
        False,
        description="是否支持SSML",
    )
    instruct: bool = Field(
        False,
        description="是否支持Instruct",
    )
    timestamp: bool = Field(
        False,
        description="是否支持时间戳",
    )
# Details of a single system voice as returned to clients.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class SystemVoiceResponse(BaseModel):
    """系统音色响应"""

    voice_id: str = Field(
        ...,
        description="音色ID",
    )
    name: str = Field(
        ...,
        description="音色名称",
    )
    trait: str = Field(
        ...,
        description="音色特质",
    )
    age: str = Field(
        ...,
        description="年龄段",
    )
    category: str = Field(
        ...,
        description="场景分类",
    )
    # The two list fields are optional and default to empty lists.
    languages: List[str] = Field(
        [],
        description="支持的语言列表",
    )
    models: List[str] = Field(
        [],
        description="支持的模型列表",
    )
    features: SystemVoiceFeatures = Field(
        ...,
        description="功能特性",
    )
# Filter parameters for listing system voices.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class SystemVoiceListRequest(BaseModel):
    """系统音色列表请求参数"""

    # Both filters are optional and default to None.
    model: Optional[str] = Field(
        None,
        description="按模型筛选",
    )
    category: Optional[str] = Field(
        None,
        description="按场景分类筛选",
    )
- # ==================== 创作历史相关Schema ====================
# One record in the creation history.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class AudioHistoryItem(BaseModel):
    """创作历史单条记录"""

    # The id may be an int or a string.
    id: Union[int, str] = Field(
        ...,
        description="记录ID(可能是数字或带前缀的字符串如 'sync-1')",
    )
    name: str = Field(
        ...,
        description="展示名称(文件名或文本摘要)",
    )
    custom_name: Optional[str] = Field(
        None,
        description="用户自定义名称",
    )
    mode: str = Field(
        ...,
        description="创作模式,例如:声音合成、语音识别等",
    )
    duration: Optional[float] = Field(
        None,
        description="音频时长(秒),仅对音频类记录有效",
    )
    characters: Optional[int] = Field(
        None,
        description="文本字符数,仅对文本类记录有效",
    )
    status: str = Field(
        ...,
        description="状态:已完成、生成中等",
    )
    audio_url: Optional[str] = Field(
        None,
        description="音频文件URL(如有)",
    )
    created_at: str = Field(
        ...,
        description="创建时间(ISO格式)",
    )
    completed_at: Optional[str] = Field(
        None,
        description="完成时间(ISO格式)",
    )

    # Fields related to recognition history (optional).
    recognition_text: Optional[str] = Field(
        None,
        description="识别文本内容(仅识别历史)",
    )
    language: Optional[str] = Field(
        None,
        description="识别出的语言(仅识别历史)",
    )
# Creation-history list together with a total record count.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class AudioHistoryListResponse(BaseModel):
    """创作历史列表响应"""

    total: int = Field(
        ...,
        description="总记录数",
    )
    # Optional; defaults to an empty list.
    items: List[AudioHistoryItem] = Field(
        [],
        description="记录列表",
    )
# Request body for renaming an audio record.
# (Docstring kept verbatim: pydantic surfaces it as the schema description.)
class UpdateAudioNameRequest(BaseModel):
    """更新音频名称请求"""

    # Required; at most 200 characters.
    custom_name: str = Field(
        ...,
        max_length=200,
        description="自定义名称",
    )
|