Ver Fonte

修复模型回复截断问题和数据集预览问题

lxylxy123321 há 1 semana atrás
pai
commit
504bd93e60

+ 12 - 14
backend/app/services/dataset_service.py

@@ -32,11 +32,14 @@ META_SIZE_THRESHOLD = 500
 
 def _is_training_data_file(path: Path) -> bool:
     """判断文件是否可能是训练数据文件(而非配置/元数据)。"""
+    if path.name in META_FILENAMES:
+        return False
     if path.suffix in (".jsonl", ".parquet", ".csv"):
+        # 小文件可能是元数据(如 ModelScope CLI 生成的 data.jsonl 只有几十字节)
+        if path.stat().st_size < META_SIZE_THRESHOLD:
+            return False
         return True
     if path.suffix == ".json":
-        if path.name in META_FILENAMES:
-            return False
         # 小 JSON 文件通常是配置
         if path.stat().st_size < META_SIZE_THRESHOLD:
             return False
@@ -50,6 +53,11 @@ def _is_training_data_file(path: Path) -> bool:
                              "text", "completion", "source", "target", "query", "response"}
                 if data_keys & set(obj.keys()):
                     return True
+                # 如果只有 framework/task/model_type 等字段,则是元数据
+                meta_keys = {"framework", "task", "license", "base_model", "model_type",
+                             "language", "domains", "tags", "authors"}
+                if meta_keys & set(obj.keys()):
+                    return False
             return True  # 大 JSON 文件默认是数据
         except Exception:
             return False
@@ -139,18 +147,8 @@ def _download_modelscope_dataset(dataset_id: str) -> tuple[Path, Path, int]:
             raise ValueError(f"No JSON/JSONL data files found in dataset {dataset_id}. "
                              f"Available files: {[f.name for f in all_files]}")
 
-    # 优先取 train / data 开头的文件
-    target = None
-    for name in ("train.jsonl", "train.json", "data.jsonl", "data.json"):
-        for f in data_files:
-            if f.name == name:
-                target = f
-                break
-        if target:
-            break
-    if not target:
-        # 优先取数据量最大的文件
-        target = sorted(data_files, key=lambda f: f.stat().st_size, reverse=True)[0]
+    # 按文件大小排序,取最大的文件作为训练数据(真正的数据集通常是最大的)
+    target = sorted(data_files, key=lambda f: f.stat().st_size, reverse=True)[0]
 
     logger.info(f"Selected data file: {target} (size={target.stat().st_size})")
 

+ 25 - 29
backend/app/services/model_test_service.py

@@ -17,8 +17,7 @@ async def test_model(model_id: str, prompt: str, max_new_tokens: int = 128, temp
 async def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> dict[str, Any]:
     """在算力节点容器内执行模型测试(通过 SSH + docker exec)。
 
-    方案:通过 SSH 在远端容器内直接执行 Python 单行命令,
-    所有参数通过环境变量传入,避免任何引号/转义问题。
+    方案:将 Python 脚本写入容器临时文件执行,避免 stdin 管道缓冲区限制。
     """
     import base64
     import json
@@ -33,10 +32,16 @@ async def _test_model_remote(model_id: str, prompt: str, max_new_tokens: int, te
     do_sample = str(temperature > 0).lower()
 
     # 独立脚本:零 app/db 依赖,参数全部通过环境变量传入
+    # 开头通过 OS 级别重定向 fd 1 到 /dev/null,抑制 C 层调试输出
+    # 最后恢复 fd 1 以打印 JSON
     script = rf"""\
-import warnings, json, os, base64, sys
+import os, sys, json, warnings, base64
+# 保存原始 fd 1(docker exec 的 stdout pipe),然后重定向到 /dev/null
+_orig_fd1 = os.dup(1)
+_devnull = os.open(os.devnull, os.O_WRONLY)
+os.dup2(_devnull, 1)
+os.close(_devnull)
 warnings.filterwarnings('ignore')
-warnings.filterwarnings('ignore', category=FutureWarning)
 os.environ['PYTHONWARNINGS'] = 'ignore'
 os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
 os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
@@ -47,7 +52,6 @@ tf_logging.set_verbosity_error()
 from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
 
 def find_model_path(model_id):
-    # 远端实际存储路径(与 model_service.resolve_model_path 一致)
     for base in [
         '/root/Fine-tuning/backend/data/models',
         '/root/.cache/huggingface/hub',
@@ -57,14 +61,11 @@ def find_model_path(model_id):
         bp = Path(base)
         if not bp.is_dir():
             continue
-        # 尝试 namespace_name 扁平化匹配(HF 风格)
         flat_name = model_id.replace("/", "_")
         if (bp / flat_name / "config.json").exists():
             return str(bp / flat_name)
-        # 尝试 namespace/name 嵌套匹配(ModelScope 风格)
         if (bp / model_id / "config.json").exists():
             return str(bp / model_id)
-        # 扫描所有目录
         try:
             for child in bp.rglob("config.json"):
                 if child.parent.is_dir():
@@ -82,23 +83,12 @@ do_sample = os.environ.get('DO_SAMPLE', 'true').lower() == 'true'
 
 model_path = find_model_path(model_id)
 if model_path is None:
-    print(json.dumps({{'error': f'Model not found in cache: {{model_id}}'}}))
+    sys.stderr.write(json.dumps({{'error': f'Model not found: {{model_id}}'}}) + '\\n')
     exit(1)
 
-# 抑制模型加载时的调试输出(Qwen3.5 等模型会直接 print 到 stdout)
-_original_stdout = sys.stdout
-
-class _SilentStdout:
-    def write(self, *args, **kwargs):
-        pass
-    def flush(self, *args, **kwargs):
-        pass
-
-sys.stdout = _SilentStdout()
 t = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 t.pad_token = t.pad_token or t.eos_token
 
-# 判断 accelerate 是否可用,决定加载策略
 has_accelerate = False
 try:
     import accelerate
@@ -124,12 +114,14 @@ for cls, kw in [(AutoModelForCausalLM, {{'trust_remote_code': True}}), (AutoMode
     if m is not None:
         break
 
-sys.stdout = _original_stdout
-
 if m is None:
-    print(json.dumps({{'error': 'Unable to load model', 'details': load_errors}}))
+    sys.stderr.write(json.dumps({{'error': 'Unable to load model', 'details': load_errors}}) + '\\n')
     exit(1)
 
+# 恢复 fd 1 到原始 stdout(docker exec 的 pipe)
+os.dup2(_orig_fd1, 1)
+os.close(_orig_fd1)
+
 m.eval()
 device = next(m.parameters()).device
 inp = t(prompt, return_tensors='pt').to(device)
@@ -140,20 +132,24 @@ print(json.dumps({{'generated_text': gen}}))
 
     script_b64 = base64.b64encode(script.encode()).decode()
 
-    # 通过环境变量传递参数,脚本通过 stdin 管道传入容器内的 Python
-    remote_cmd = (
-        f"echo {script_b64} | base64 -d | "
-        f"docker exec -i -w {workdir} "
+    # 先将脚本写入容器内的临时文件,再执行,避免 echo | pipe 的缓冲区限制
+    script_path = f"/tmp/test_model_{model_id.replace('/', '_')}.py"
+    write_cmd = (
+        f"echo {script_b64} | base64 -d > {script_path} && "
+        f"docker exec -w {workdir} "
         f"-e MODEL_ID={model_id} "
         f"-e PROMPT_B64={prompt_b64} "
         f"-e MAX_TOKENS={max_new_tokens} "
         f"-e TEMPERATURE={temperature} "
         f"-e TOP_P={top_p} "
         f"-e DO_SAMPLE={do_sample} "
-        f"{container} {python}"
+        f"{container} {python} {script_path}"
     )
 
-    code, stdout, stderr = ssh_exec(remote_cmd, timeout=600)
+    code, stdout, stderr = ssh_exec(write_cmd, timeout=600)
+
+    # 清理临时文件
+    ssh_exec(f"docker exec {container} rm -f {script_path}", timeout=5)
 
     logger.info(f"Remote test result: code={code}, stdout_len={len(stdout)}, stderr_len={len(stderr)}")
     if stdout:

+ 5 - 9
result.txt

@@ -1,6 +1,5 @@
-finetune-backend  | INFO:     172.20.0.4:38278 - "GET /api/v1/models/ HTTP/1.0" 200 OK
-finetune-backend  | 2026-05-20 02:43:37 | INFO     | peft-platform | Remote test result: code=0, stdout_len=1793, stderr_len=2305
-finetune-backend  | 2026-05-20 02:43:37 | INFO     | peft-platform | stdout (first 500): 1,7,16,128,128,64,1,1,None
+finetune-backend  | 2026-05-20 02:49:41 | INFO     | peft-platform | Remote test result: code=0, stdout_len=1544, stderr_len=2373
+finetune-backend  | 2026-05-20 02:49:41 | INFO     | peft-platform | stdout (first 500): 1,7,16,128,128,64,1,1,None
 finetune-backend  | 1,7,16,128,128,64,1,1,None
 finetune-backend  | 1,7,16,128,128,64,1,1,None
 finetune-backend  | 1,7,16,128,128,64,1,1,None
@@ -19,13 +18,10 @@ finetune-backend  | 1,7,16,128,128,64,1,1,None
 finetune-backend  | 1,7,16,128,128,64,1,1,None
 finetune-backend  | 1,7,16,128,128,64,1,1,None
 finetune-backend  | {"generated_te
-finetune-backend  | 2026-05-20 02:43:37 | INFO     | peft-platform | stderr (first 500): Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
+finetune-backend  | 2026-05-20 02:49:41 | INFO     | peft-platform | stderr (first 500): Current Triton version 3.0.0 is below the recommended 3.2.0 version. Errors may occur and these issues will not be fixed. Please consider upgrading Triton.
 finetune-backend  | Current Python version 3.10 is below the recommended 3.11 version. It is recommended to upgrade to Python 3.11 or higher for the best experience.
 finetune-backend  | torch.compile is not available in Python 3.10, using identity decorator instead
 finetune-backend  | 
 finetune-backend  | Loading weights:   0%|          | 0/320 [00:00<?, ?it/s]
-finetune-backend  | Loading weights:   0%|          | 1/320 [00:01<10:04,  1.89s
-finetune-backend  | INFO:     172.20.0.4:38294 - "POST /api/v1/models/test HTTP/1.0" 200 OK
-finetune-backend  | INFO:     127.0.0.1:46336 - "GET /health HTTP/1.1" 200 OK
-finetune-backend  | INFO:     127.0.0.1:45404 - "GET /health HTTP/1.1" 200 OK
-finetune-backend  | INFO:     127.0.0.1:42132 - "GET /health HTTP/1.1" 200 OK
+finetune-backend  | Loading weights:   0%|          | 1/320 [00:02<11:37,  2.19s
+finetune-backend  | INFO:     172.20.0.4:51666 - "POST /api/v1/models/test HTTP/1.0" 200 OK