audio_schema.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275
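"""
Data transfer object definitions for the AI speech features.

Defines the request and response data structures for the speech synthesis,
speech recognition, voice cloning, and voice management APIs.
Requirements: 6.1-6.13, 8.1
"""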
  6. from typing import List, Optional, Literal, Union
  7. from pydantic import BaseModel, Field, field_validator
# ==================== TTS schemas ====================
  9. class TTSRequest(BaseModel):
  10. """语音合成请求"""
  11. model: str = Field(..., description="语音合成模型:cosyvoice-v3-flash、cosyvoice-v3-plus、cosyvoice-v2")
  12. voice: str = Field(..., description="音色ID,系统音色或复刻音色")
  13. text: str = Field(..., description="待合成文本,单次不超过2000字符")
  14. stream: bool = Field(default=False, description="是否使用流式输出")
  15. format: str = Field(default="mp3", description="音频格式:mp3、wav、pcm、opus")
  16. sample_rate: int = Field(default=22050, description="采样率:8000、16000、22050、24000、44100、48000")
  17. volume: int = Field(default=50, ge=0, le=100, description="音量,范围 [0, 100]")
  18. speech_rate: float = Field(default=1.0, ge=0.5, le=2.0, description="语速,范围 [0.5, 2.0]")
  19. pitch_rate: float = Field(default=1.0, ge=0.5, le=2.0, description="音高,范围 [0.5, 2.0]")
  20. instruction: Optional[str] = Field(default=None, description="指令设置(情感、场景等),仅部分音色支持")
  21. @field_validator('text')
  22. @classmethod
  23. def validate_text(cls, v):
  24. if not v or not v.strip():
  25. raise ValueError('文本不能为空')
  26. return v
  27. @field_validator('format')
  28. @classmethod
  29. def validate_format(cls, v):
  30. allowed_formats = ['mp3', 'wav', 'pcm', 'opus']
  31. if v not in allowed_formats:
  32. raise ValueError(f'不支持的音频格式,允许的格式: {allowed_formats}')
  33. return v
  34. @field_validator('sample_rate')
  35. @classmethod
  36. def validate_sample_rate(cls, v):
  37. allowed_rates = [8000, 16000, 22050, 24000, 44100, 48000]
  38. if v not in allowed_rates:
  39. raise ValueError(f'不支持的采样率,允许的采样率: {allowed_rates}')
  40. return v
  41. class TTSResponse(BaseModel):
  42. """语音合成响应(非流式)"""
  43. audio_url: str = Field(..., description="OSS上的音频文件URL")
  44. duration: float = Field(..., description="音频时长(秒)")
  45. format: str = Field(..., description="音频格式")
  46. sample_rate: int = Field(..., description="采样率")
  47. characters: int = Field(..., description="合成的字符数")
  48. class LongTTSResponse(BaseModel):
  49. """长文本语音合成响应"""
  50. audio_url: str = Field(..., description="OSS上的音频文件URL")
  51. duration: float = Field(..., description="音频总时长(秒)")
  52. format: str = Field(..., description="音频格式")
  53. total_characters: int = Field(..., description="总字符数")
  54. segments: int = Field(..., description="切割的片段数")
  55. class TTSModelResponse(BaseModel):
  56. """TTS模型信息响应"""
  57. id: int = Field(..., description="模型ID")
  58. title: str = Field(..., description="模型标识")
  59. name: str = Field(..., description="模型名称")
  60. description: str = Field(..., description="模型描述")
  61. price: str = Field(..., description="价格信息")
  62. features: List[str] = Field(default=[], description="功能特性列表")
# ==================== ASR schemas ====================
  64. class ASRRequest(BaseModel):
  65. """同步语音识别请求"""
  66. model: str = Field(..., description="识别模型:qwen3-asr-flash、qwen-audio-asr")
  67. audio_url: Optional[str] = Field(default=None, description="音频文件URL(与audio_base64二选一)")
  68. audio_base64: Optional[str] = Field(default=None, description="Base64编码的音频数据(与audio_url二选一)")
  69. language: Optional[str] = Field(default=None, description="指定语种:zh、en、ja、ko等,不指定则自动检测")
  70. enable_itn: bool = Field(default=False, description="是否启用逆文本标准化(仅中英文)")
  71. context: Optional[str] = Field(default=None, description="上下文提示,提升特定场景识别准确率")
  72. @field_validator('audio_url', 'audio_base64')
  73. @classmethod
  74. def validate_audio_source(cls, v, info):
  75. # 验证逻辑在模型级别处理
  76. return v
  77. class ASRUsage(BaseModel):
  78. """ASR使用统计"""
  79. input_tokens: int = Field(default=0, description="输入Token数")
  80. output_tokens: int = Field(default=0, description="输出Token数")
  81. seconds: int = Field(default=0, description="音频时长(秒)")
  82. class ASRResponse(BaseModel):
  83. """同步语音识别响应"""
  84. text: str = Field(..., description="识别文本")
  85. language: str = Field(..., description="检测到的语言")
  86. emotion: Optional[str] = Field(default=None, description="情感类型")
  87. duration: int = Field(..., description="音频时长(秒)")
  88. usage: ASRUsage = Field(..., description="使用统计")
  89. class TranscribeRequest(BaseModel):
  90. """异步转写请求"""
  91. model: str = Field(..., description="识别模型:qwen3-asr-flash-filetrans")
  92. file_url: str = Field(..., description="音频文件URL,必须公网可访问")
  93. language: Optional[str] = Field(default=None, description="指定语种")
  94. enable_itn: bool = Field(default=False, description="是否启用ITN")
  95. context: Optional[str] = Field(default=None, description="上下文提示")
  96. channel_id: List[int] = Field(default=[0], description="多音轨文件的音轨索引")
  97. class TranscriptSentence(BaseModel):
  98. """转写句子"""
  99. begin_time: int = Field(..., description="开始时间(毫秒)")
  100. end_time: int = Field(..., description="结束时间(毫秒)")
  101. text: str = Field(..., description="句子文本")
  102. sentence_id: int = Field(..., description="句子ID")
  103. language: Optional[str] = Field(default=None, description="语言")
  104. emotion: Optional[str] = Field(default=None, description="情感")
  105. class TranscriptChannel(BaseModel):
  106. """转写音轨结果"""
  107. channel_id: int = Field(..., description="音轨ID")
  108. text: str = Field(..., description="完整文本")
  109. sentences: List[TranscriptSentence] = Field(default=[], description="句子列表")
  110. class TranscribeResult(BaseModel):
  111. """转写结果"""
  112. transcription_url: Optional[str] = Field(default=None, description="结果文件URL")
  113. transcripts: List[TranscriptChannel] = Field(default=[], description="转写结果列表")
  114. class TaskUsage(BaseModel):
  115. """任务使用统计"""
  116. seconds: int = Field(default=0, description="音频时长(秒)")
  117. class TaskResponse(BaseModel):
  118. """异步任务响应"""
  119. task_id: str = Field(..., description="任务ID")
  120. task_status: str = Field(..., description="任务状态:PENDING、RUNNING、SUCCEEDED、FAILED、UNKNOWN")
  121. submit_time: Optional[str] = Field(default=None, description="提交时间")
  122. scheduled_time: Optional[str] = Field(default=None, description="调度时间")
  123. end_time: Optional[str] = Field(default=None, description="完成时间")
  124. result: Optional[TranscribeResult] = Field(default=None, description="转写结果")
  125. usage: Optional[TaskUsage] = Field(default=None, description="使用统计")
  126. error_message: Optional[str] = Field(default=None, description="失败时的错误信息")
  127. class ASRModelResponse(BaseModel):
  128. """ASR模型信息响应"""
  129. id: int = Field(..., description="模型ID")
  130. title: str = Field(..., description="模型标识")
  131. name: str = Field(..., description="模型名称")
  132. description: str = Field(..., description="模型描述")
  133. call_type: str = Field(..., description="调用方式:sync、async")
  134. features: List[str] = Field(default=[], description="功能特性列表")
# ==================== Voice cloning schemas ====================
  136. class VoiceCreateRequest(BaseModel):
  137. """创建复刻音色请求"""
  138. target_model: str = Field(..., description="驱动音色的语音合成模型:cosyvoice-v3-plus、cosyvoice-v3-flash、cosyvoice-v2")
  139. prefix: str = Field(..., max_length=10, description="音色名称前缀,仅允许数字、字母和下划线,不超过10字符")
  140. voice_name: Optional[str] = Field(default=None, max_length=50, description="音色名称(用户输入的中文名称)")
  141. audio_url: Optional[str] = Field(default=None, description="音频文件URL(与file二选一)")
  142. language_hints: Optional[List[str]] = Field(default=None, description="语言提示:en、fr、de、ja、ko、ru")
  143. @field_validator('prefix')
  144. @classmethod
  145. def validate_prefix(cls, v):
  146. import re
  147. if not re.match(r'^[a-zA-Z0-9_]+$', v):
  148. raise ValueError('前缀仅允许数字、字母和下划线')
  149. return v
  150. @field_validator('target_model')
  151. @classmethod
  152. def validate_target_model(cls, v):
  153. allowed_models = ['cosyvoice-v3-plus', 'cosyvoice-v3-flash', 'cosyvoice-v2', 'cosyvoice-clone-v1']
  154. if v not in allowed_models:
  155. raise ValueError(f'不支持的目标模型,允许的模型: {allowed_models}')
  156. return v
  157. class VoiceUpdateRequest(BaseModel):
  158. """更新音色请求"""
  159. audio_url: Optional[str] = Field(default=None, description="新的音频文件URL(与file二选一)")
  160. class VoiceResponse(BaseModel):
  161. """音色信息响应"""
  162. voice_id: str = Field(..., description="音色ID")
  163. status: str = Field(..., description="音色状态:DEPLOYING、OK、UNDEPLOYED")
  164. target_model: Optional[str] = Field(default=None, description="目标模型")
  165. voice_name: Optional[str] = Field(default=None, description="音色名称(用户输入的中文名称)")
  166. resource_link: Optional[str] = Field(default=None, description="音频资源链接")
  167. gmt_create: Optional[str] = Field(default=None, description="创建时间")
  168. gmt_modified: Optional[str] = Field(default=None, description="修改时间")
  169. class VoiceListResponse(BaseModel):
  170. """音色列表响应"""
  171. total: int = Field(..., description="总数")
  172. voices: List[VoiceResponse] = Field(default=[], description="音色列表")
# ==================== System voice schemas ====================
  174. class SystemVoiceFeatures(BaseModel):
  175. """系统音色功能特性"""
  176. ssml: bool = Field(default=False, description="是否支持SSML")
  177. instruct: bool = Field(default=False, description="是否支持Instruct")
  178. timestamp: bool = Field(default=False, description="是否支持时间戳")
  179. class SystemVoiceResponse(BaseModel):
  180. """系统音色响应"""
  181. voice_id: str = Field(..., description="音色ID")
  182. name: str = Field(..., description="音色名称")
  183. trait: str = Field(..., description="音色特质")
  184. age: str = Field(..., description="年龄段")
  185. category: str = Field(..., description="场景分类")
  186. languages: List[str] = Field(default=[], description="支持的语言列表")
  187. models: List[str] = Field(default=[], description="支持的模型列表")
  188. features: SystemVoiceFeatures = Field(..., description="功能特性")
  189. class SystemVoiceListRequest(BaseModel):
  190. """系统音色列表请求参数"""
  191. model: Optional[str] = Field(default=None, description="按模型筛选")
  192. category: Optional[str] = Field(default=None, description="按场景分类筛选")
# ==================== Creation history schemas ====================
  194. class AudioHistoryItem(BaseModel):
  195. """创作历史单条记录"""
  196. id: Union[int, str] = Field(..., description="记录ID(可能是数字或带前缀的字符串如 'sync-1')")
  197. name: str = Field(..., description="展示名称(文件名或文本摘要)")
  198. custom_name: Optional[str] = Field(default=None, description="用户自定义名称")
  199. mode: str = Field(..., description="创作模式,例如:声音合成、语音识别等")
  200. duration: Optional[float] = Field(default=None, description="音频时长(秒),仅对音频类记录有效")
  201. characters: Optional[int] = Field(default=None, description="文本字符数,仅对文本类记录有效")
  202. status: str = Field(..., description="状态:已完成、生成中等")
  203. audio_url: Optional[str] = Field(default=None, description="音频文件URL(如有)")
  204. created_at: str = Field(..., description="创建时间(ISO格式)")
  205. completed_at: Optional[str] = Field(default=None, description="完成时间(ISO格式)")
  206. # 识别历史相关字段(可选)
  207. recognition_text: Optional[str] = Field(default=None, description="识别文本内容(仅识别历史)")
  208. language: Optional[str] = Field(default=None, description="识别出的语言(仅识别历史)")
  209. class AudioHistoryListResponse(BaseModel):
  210. """创作历史列表响应"""
  211. total: int = Field(..., description="总记录数")
  212. items: List[AudioHistoryItem] = Field(default=[], description="记录列表")
  213. class UpdateAudioNameRequest(BaseModel):
  214. """更新音频名称请求"""
  215. custom_name: str = Field(..., max_length=200, description="自定义名称")