# meta_registry.py
from typing import Any, Dict, Optional

from transformers import PretrainedConfig
  3. # Languages supported by Voxtral models
  4. # https://github.com/vllm-project/vllm/blob/db6f71d4c9efc4679b05311c9a8fcc594b187c06/vllm/model_executor/models/voxtral.py#L69
  5. VOXTRAL_SUPPORTED_LANGS = {
  6. "en": "English",
  7. "ar": "Arabic",
  8. "nl": "Dutch",
  9. "fr": "French",
  10. "de": "German",
  11. "hi": "Hindi",
  12. "it": "Italian",
  13. "pt": "Portuguese",
  14. "es": "Spanish",
  15. }
  16. # Languages supported by Granite-Speech models
  17. # https://github.com/vllm-project/vllm/blob/6abb0454adb531de0b081bbf65ccf907e4bd560d/vllm/model_executor/models/granite_speech.py#L80C1-L86C2
  18. GRANITE_SUPPORTED_LANGS = {
  19. "en": "English",
  20. "fr": "French",
  21. "de": "German",
  22. "pt": "Portuguese",
  23. "es": "Spanish",
  24. }
  25. # Languages supported by Whisper, GLMASR, Qwen3ASR
  26. # https://github.com/vllm-project/vllm/blob/6abb0454adb531de0b081bbf65ccf907e4bd560d/vllm/model_executor/models/whisper_utils.py#L6
  27. ISO639_1_SUPPORTED_LANGS = {
  28. "en": "English",
  29. "zh": "Chinese",
  30. "af": "Afrikaans",
  31. "ar": "Arabic",
  32. "hy": "Armenian",
  33. "az": "Azerbaijani",
  34. "be": "Belarusian",
  35. "bs": "Bosnian",
  36. "bg": "Bulgarian",
  37. "ca": "Catalan",
  38. "hr": "Croatian",
  39. "cs": "Czech",
  40. "da": "Danish",
  41. "nl": "Dutch",
  42. "et": "Estonian",
  43. "fi": "Finnish",
  44. "fr": "French",
  45. "gl": "Galician",
  46. "de": "German",
  47. "el": "Greek",
  48. "he": "Hebrew",
  49. "hi": "Hindi",
  50. "hu": "Hungarian",
  51. "is": "Icelandic",
  52. "id": "Indonesian",
  53. "it": "Italian",
  54. "ja": "Japanese",
  55. "kn": "Kannada",
  56. "kk": "Kazakh",
  57. "ko": "Korean",
  58. "lv": "Latvian",
  59. "lt": "Lithuanian",
  60. "mk": "Macedonian",
  61. "ms": "Malay",
  62. "mr": "Marathi",
  63. "mi": "Maori",
  64. "ne": "Nepali",
  65. "no": "Norwegian",
  66. "fa": "Persian",
  67. "pl": "Polish",
  68. "pt": "Portuguese",
  69. "ro": "Romanian",
  70. "ru": "Russian",
  71. "sr": "Serbian",
  72. "sk": "Slovak",
  73. "sl": "Slovenian",
  74. "es": "Spanish",
  75. "sw": "Swahili",
  76. "sv": "Swedish",
  77. "tl": "Tagalog",
  78. "ta": "Tamil",
  79. "th": "Thai",
  80. "tr": "Turkish",
  81. "uk": "Ukrainian",
  82. "ur": "Urdu",
  83. "vi": "Vietnamese",
  84. "cy": "Welsh",
  85. }
  86. # Languages supported by Qwen3-TTS
  87. # https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice/blob/main/config.json#L111
  88. QWEN3_TTS_SUPPORTED_LANGS = {
  89. "auto": "Auto",
  90. "zh": "Chinese",
  91. "en": "English",
  92. "de": "German",
  93. "it": "Italian",
  94. "pt": "Portuguese",
  95. "es": "Spanish",
  96. "ja": "Japanese",
  97. "ko": "Korean",
  98. "fr": "French",
  99. "ru": "Russian",
  100. }
  101. # Voices supported by Qwen3-TTS
  102. # https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice/blob/main/config.json#L129
  103. QWEN3_TTS_SUPPORTED_VOICES = [
  104. "Vivian",
  105. "Serena",
  106. "Uncle_Fu",
  107. "Dylan",
  108. "Eric",
  109. "Ryan",
  110. "Aiden",
  111. "Ono_Anna",
  112. "Sohee",
  113. ]
  114. def get_model_meta(pretrained_config: PretrainedConfig) -> Optional[Dict[str, any]]:
  115. """
  116. Get model meta information based on the model architectures.
  117. """
  118. if not pretrained_config:
  119. return None
  120. architectures = getattr(pretrained_config, "architectures", []) or []
  121. if not architectures:
  122. return None
  123. model_meta: dict[str, any] = {}
  124. arch_set = set(architectures)
  125. if "VoxtralForConditionalGeneration" in arch_set:
  126. model_meta["languages"] = list(VOXTRAL_SUPPORTED_LANGS.keys())
  127. elif "GraniteSpeechForConditionalGeneration" in arch_set:
  128. model_meta["languages"] = list(GRANITE_SUPPORTED_LANGS.keys())
  129. elif any(
  130. arch
  131. in {
  132. "WhisperForConditionalGeneration",
  133. "GlmAsrForConditionalGeneration",
  134. "Qwen3ASRForConditionalGeneration",
  135. }
  136. for arch in arch_set
  137. ):
  138. model_meta["languages"] = list(ISO639_1_SUPPORTED_LANGS.keys())
  139. elif "Qwen3TTSForConditionalGeneration" in arch_set:
  140. model_meta["languages"] = list(
  141. QWEN3_TTS_SUPPORTED_LANGS.values()
  142. ) # Qwen3-TTS uses full language names
  143. model_meta["voices"] = QWEN3_TTS_SUPPORTED_VOICES
  144. tts_model_type = getattr(pretrained_config, "tts_model_type", "") or ""
  145. if tts_model_type:
  146. # Options: CustomVoice, VoiceDesign, Base. Convert snake_case to CamelCase. e.g., custom_voice -> CustomVoice.
  147. model_meta["task_type"] = "".join(
  148. word.capitalize() for word in tts_model_type.split("_")
  149. )
  150. return model_meta