소스 검색

fix:修改施工方案筛选脚本

Meric 2 주 전
부모
커밋
2c6ecc451f

+ 4 - 0
.opencode

@@ -0,0 +1,4 @@
+{
+  "$schema": "https://opencode.ai/config.json",
+  "permission": "allow"
+}

+ 0 - 537
src/app/scripts/ceshi/03-施工方案筛选_服务器版.py

@@ -1,537 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-评审意见文件筛选脚本 - 服务器并发版
-
-相对原始版本的改进:
-1. 目录级并发处理(多进程),充分利用服务器CPU
-2. 无交互参数化(适合nohup/screen/任务调度)
-3. 保留断点续跑能力,周期性增量写入缓存
-4. 保持原有筛选规则:Top5优先 + 其余补充 + 两阶段页数检查
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import logging
-import multiprocessing as mp
-import os
-import re
-import shutil
-import sys
-import warnings
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import pandas as pd
-import PyPDF2
-from docx import Document
-
-warnings.filterwarnings("ignore", category=UserWarning, module="PyPDF2")
-warnings.filterwarnings("ignore", category=Warning)
-# pypdf/PyPDF2在部分PDF字体映射异常时会输出"unknown widths"噪声日志,降级为ERROR避免刷屏。
-logging.getLogger("PyPDF2").setLevel(logging.ERROR)
-logging.getLogger("pypdf").setLevel(logging.ERROR)
-
-KEYWORDS = {
-    "expert": [
-        "专家评审意见", "专家评审记录", "专家评审结论",
-        "专家评估意见", "专家评估记录", "专家评估结论",
-        "专家审查意见", "专家审查记录", "专家审查结论",
-        "专家评审说明", "专家评估说明", "专家审查说明",
-        "专家评审建议", "专家评估建议", "专家审查建议",
-        "专家评审纪要", "专家评估纪要", "专家审查纪要",
-        "专家评审报告", "专家评估报告", "专家审查报告",
-        "专家评审审核表", "专家评估审核表", "专家审查审核表",
-        "专家评审审查表", "专家评估审查表", "专家审查审查表",
-    ],
-    "company": ["公司评审意见", "集团评审意见", "施工方案审核意见"],
-}
-
-KEYWORD_PATTERNS = {
-    "expert": [
-        r"专家(评审|评估|审查).{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表)",
-        r"(评审|评估|审查).{0,10}专家.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表)",
-    ],
-    "company": [
-        r"(公司|集团|项目公司).{0,10}(评审|审核|审查).{0,10}(意见|说明|记录)",
-        r"施工方案.{0,10}(审核|审查|评审).{0,10}(意见|说明|记录)",
-    ],
-}
-
-
-def _extract_pdf_text_worker(pdf_path_str: str, max_pages: int, result_queue: mp.Queue) -> None:
-    try:
-        text = extract_pdf_text_core(Path(pdf_path_str), max_pages=max_pages)
-        result_queue.put({"ok": True, "text": text})
-    except Exception as e:
-        result_queue.put({"ok": False, "error": str(e)})
-
-
-def get_file_size(file_path: Path) -> int:
-    try:
-        return file_path.stat().st_size
-    except Exception:
-        return 0
-
-
-def get_file_creation_time(file_path: Path) -> float:
-    try:
-        return file_path.stat().st_ctime
-    except Exception:
-        return 0
-
-
-def normalize_text(text: str) -> str:
-    cleaned = re.sub(r"\s+", "", text)
-    cleaned = cleaned.replace("(", "(").replace(")", ")").replace(":", ":")
-    cleaned = cleaned.replace(",", ",").replace("。", ".").replace("、", "")
-    return cleaned
-
-
-def extract_pdf_text_core(pdf_path: Path, max_pages: int) -> str:
-    try:
-        with open(pdf_path, "rb") as f:
-            reader = PyPDF2.PdfReader(f)
-            pages = min(len(reader.pages), max_pages)
-            chunks: List[str] = []
-            for idx in range(pages):
-                try:
-                    t = reader.pages[idx].extract_text()
-                    if t:
-                        chunks.append(t)
-                except Exception:
-                    continue
-            return "\n".join(chunks)
-    except Exception:
-        return ""
-
-
-def extract_pdf_text(pdf_path: Path, max_pages: int, timeout_seconds: int) -> str:
-    """带超时保护的PDF提取,防止单文件卡死整个批次。"""
-    ctx = mp.get_context("spawn")
-    q: mp.Queue = ctx.Queue()
-    p = ctx.Process(target=_extract_pdf_text_worker, args=(str(pdf_path), max_pages, q))
-    p.start()
-    p.join(timeout_seconds)
-    if p.is_alive():
-        p.terminate()
-        p.join(timeout=2)
-        print(f"[WARN] PDF解析超时({timeout_seconds}s): {pdf_path}")
-        return ""
-    if q.empty():
-        return ""
-    result = q.get()
-    if not result.get("ok"):
-        return ""
-    return result.get("text", "")
-
-
-def extract_docx_text(docx_path: Path, max_pages: int) -> str:
-    try:
-        approx_max_paragraphs = max(1, max_pages * 40)
-        doc = Document(str(docx_path))
-        chunks: List[str] = []
-        for i, p in enumerate(doc.paragraphs):
-            if i >= approx_max_paragraphs:
-                break
-            if p.text:
-                chunks.append(p.text)
-        return "\n".join(chunks)
-    except Exception:
-        return ""
-
-
-def check_contains_keywords(file_path: Path, review_type: str, max_pages: int, pdf_timeout_seconds: int) -> Tuple[bool, str]:
-    suffix = file_path.suffix.lower()
-    if suffix == ".pdf":
-        text = extract_pdf_text(file_path, max_pages=max_pages, timeout_seconds=pdf_timeout_seconds)
-    elif suffix == ".docx":
-        text = extract_docx_text(file_path, max_pages=max_pages)
-    elif suffix == ".doc":
-        return False, ""
-    else:
-        return False, ""
-
-    cleaned_text = normalize_text(text)
-    keywords = KEYWORDS[review_type]
-
-    for kw in keywords:
-        if normalize_text(kw) in cleaned_text:
-            return True, kw
-
-    for pattern in KEYWORD_PATTERNS.get(review_type, []):
-        if re.search(pattern, cleaned_text, re.IGNORECASE):
-            return True, f"模式命中:{pattern}"
-    return False, ""
-
-
-def find_candidate_files(directory: Path) -> List[Path]:
-    if not directory.exists() or not directory.is_dir():
-        return []
-    files: List[Path] = []
-    for pattern in ("*.pdf", "*.docx", "*.doc"):
-        files.extend(directory.glob(pattern))
-    return files
-
-
-def get_top5_by_size(files: List[Path]) -> List[Path]:
-    return sorted(files, key=get_file_size, reverse=True)[:5]
-
-
-def get_newest_file(files: List[Path]) -> Optional[Path]:
-    if not files:
-        return None
-    return max(files, key=get_file_creation_time)
-
-
-def pick_review_file(dir_path: Path, review_type: str, max_pages: int, pdf_timeout_seconds: int) -> Tuple[bool, str, Optional[Path], List[Path], str]:
-    files = find_candidate_files(dir_path)
-    if not files:
-        return False, "目录中未找到PDF/DOCX文件", None, [], ""
-
-    print(f"[{dir_path.name}][{review_type}] 候选文件 {len(files)} 个,先检查Top5...")
-    top5 = get_top5_by_size(files)
-    matched_top5: List[Path] = []
-    matched_kw = ""
-
-    for idx, f in enumerate(top5, start=1):
-        print(f"[{dir_path.name}][{review_type}] Top5进度 {idx}/{len(top5)}: {f.name}")
-        ok, kw = check_contains_keywords(f, review_type, max_pages=max_pages, pdf_timeout_seconds=pdf_timeout_seconds)
-        if ok:
-            matched_top5.append(f)
-            if not matched_kw:
-                matched_kw = kw
-
-    if matched_top5:
-        selected = get_newest_file(matched_top5)
-        return True, "Top5命中", selected, matched_top5, matched_kw
-
-    others = [f for f in files if f not in top5]
-    matched_others: List[Path] = []
-    if others:
-        print(f"[{dir_path.name}][{review_type}] 开始检查剩余文件 {len(others)} 个...")
-    for idx, f in enumerate(others, start=1):
-        print(f"[{dir_path.name}][{review_type}] 其余进度 {idx}/{len(others)}: {f.name}")
-        ok, kw = check_contains_keywords(f, review_type, max_pages=max_pages, pdf_timeout_seconds=pdf_timeout_seconds)
-        if ok:
-            matched_others.append(f)
-            if not matched_kw:
-                matched_kw = kw
-
-    if matched_others:
-        selected = get_newest_file(matched_others)
-        return True, "其他文件命中", selected, matched_others, matched_kw
-    return False, "未找到包含关键词的文件", None, [], ""
-
-
-def process_one_directory_task(
-    dir_path_str: str,
-    phase1_pages: int,
-    phase2_pages: int,
-    do_phase2: bool,
-    pdf_timeout_seconds: int,
-) -> Dict:
-    dir_path = Path(dir_path_str)
-    dir_id = dir_path.name
-    row_results: List[Dict] = []
-    print(f"[{dir_id}] 开始处理目录")
-
-    for review_type in ("expert", "company"):
-        print(f"[{dir_id}] 开始处理类型: {review_type}")
-        ok1, msg1, selected1, all_matched1, kw1 = pick_review_file(
-            dir_path, review_type, max_pages=phase1_pages, pdf_timeout_seconds=pdf_timeout_seconds
-        )
-        if ok1 and selected1:
-            print(
-                f"[{dir_id}][{review_type}] 命中({msg1}) 选择文件: {selected1.name} 关键词: {kw1} 匹配数: {len(all_matched1)}"
-            )
-            row_results.append(
-                {
-                    "目录ID": dir_id,
-                    "评审类型": review_type,
-                    "阶段": "第1阶段",
-                    "状态": "成功",
-                    "备注": msg1,
-                    "匹配关键词": kw1,
-                    "匹配文件数": len(all_matched1),
-                    "原路径": str(selected1),
-                    "原文件名": selected1.name,
-                }
-            )
-            continue
-
-        if do_phase2:
-            ok2, msg2, selected2, all_matched2, kw2 = pick_review_file(
-                dir_path, review_type, max_pages=phase2_pages, pdf_timeout_seconds=pdf_timeout_seconds
-            )
-            if ok2 and selected2:
-                print(
-                    f"[{dir_id}][{review_type}] 命中({msg2}) 选择文件: {selected2.name} 关键词: {kw2} 匹配数: {len(all_matched2)}"
-                )
-                row_results.append(
-                    {
-                        "目录ID": dir_id,
-                        "评审类型": review_type,
-                        "阶段": "第2阶段",
-                        "状态": "成功",
-                        "备注": msg2,
-                        "匹配关键词": kw2,
-                        "匹配文件数": len(all_matched2),
-                        "原路径": str(selected2),
-                        "原文件名": selected2.name,
-                    }
-                )
-                continue
-
-            last_message = msg2
-        else:
-            last_message = msg1
-
-        print(f"[{dir_id}][{review_type}] 未找到: {last_message}")
-        row_results.append(
-            {
-                "目录ID": dir_id,
-                "评审类型": review_type,
-                "阶段": "第2阶段" if do_phase2 else "第1阶段",
-                "状态": "未找到",
-                "备注": last_message,
-                "匹配关键词": "",
-                "匹配文件数": 0,
-                "原路径": "",
-                "原文件名": "",
-            }
-        )
-
-    print(f"[{dir_id}] 目录处理结束")
-    return {"dir_id": dir_id, "results": row_results}
-
-
-def load_cache(cache_file: Path) -> Dict:
-    if cache_file.exists():
-        try:
-            return json.loads(cache_file.read_text(encoding="utf-8"))
-        except Exception:
-            pass
-    return {"processed_dirs": [], "results": [], "start_time": datetime.now().isoformat()}
-
-
-def save_cache(cache_file: Path, data: Dict) -> None:
-    cache_file.parent.mkdir(parents=True, exist_ok=True)
-    data["last_update"] = datetime.now().isoformat()
-    cache_file.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
-
-
-def copy_outputs(results: List[Dict], expert_dir: Path, company_dir: Path) -> List[Dict]:
-    expert_dir.mkdir(parents=True, exist_ok=True)
-    company_dir.mkdir(parents=True, exist_ok=True)
-    final_rows: List[Dict] = []
-
-    for row in results:
-        out = dict(row)
-        if row.get("状态") != "成功":
-            out["新文件名"] = ""
-            out["目标路径"] = ""
-            out["处理时间"] = datetime.now().isoformat()
-            final_rows.append(out)
-            continue
-
-        src = Path(row["原路径"])
-        target_dir = expert_dir if row["评审类型"] == "expert" else company_dir
-        new_name = f"{row['目录ID']}_{src.name}"
-        dst = target_dir / new_name
-        try:
-            shutil.copy2(src, dst)
-            out["新文件名"] = new_name
-            out["目标路径"] = str(dst)
-        except Exception as e:
-            out["状态"] = "复制失败"
-            out["备注"] = str(e)
-            out["新文件名"] = ""
-            out["目标路径"] = ""
-        out["处理时间"] = datetime.now().isoformat()
-        final_rows.append(out)
-    return final_rows
-
-
-def get_numeric_dirs(source_dir: Path) -> List[Path]:
-    if not source_dir.exists():
-        return []
-    dirs = [d for d in source_dir.iterdir() if d.is_dir() and d.name.isdigit()]
-    dirs.sort(key=lambda d: int(d.name))
-    return dirs
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="评审意见筛选 - 服务器并发版")
-    parser.add_argument("--source-dir", required=True, help="源目录(包含数字子目录)")
-    parser.add_argument("--expert-output-dir", required=True, help="专家评审输出目录")
-    parser.add_argument("--company-output-dir", required=True, help="公司/集团评审输出目录")
-    parser.add_argument("--temp-dir", default="./temp_server", help="缓存目录")
-    parser.add_argument("--phase1-pages", type=int, default=15, help="一级筛选页数")
-    parser.add_argument("--phase2-pages", type=int, default=30, help="二级筛选页数")
-    parser.add_argument("--disable-phase2", action="store_true", help="禁用二级筛选")
-    parser.add_argument("--workers", type=int, default=max(1, (os.cpu_count() or 4) - 1), help="并发进程数")
-    parser.add_argument("--save-every", type=int, default=20, help="每处理N个目录写一次缓存")
-    parser.add_argument("--pdf-timeout-seconds", type=int, default=45, help="单个PDF解析超时秒数")
-    parser.add_argument("--report-dir", default="", help="结果报表目录(默认使用expert输出目录的上级)")
-    parser.add_argument("--retry-failed", action="store_true", help="重跑历史状态为处理异常的目录")
-    parser.add_argument("--copy-as-you-go", action="store_true", help="每完成一个目录就复制输出文件(便于实时看到结果)")
-    return parser.parse_args()
-
-
-def main() -> None:
-    args = parse_args()
-    source_dir = Path(args.source_dir).expanduser().resolve()
-    expert_output_dir = Path(args.expert_output_dir).expanduser().resolve()
-    company_output_dir = Path(args.company_output_dir).expanduser().resolve()
-    temp_dir = Path(args.temp_dir).expanduser().resolve()
-    cache_file = temp_dir / "评审筛选进度缓存_服务器版.json"
-    report_dir = (
-        Path(args.report_dir).expanduser().resolve()
-        if args.report_dir.strip()
-        else expert_output_dir.parent
-    )
-
-    do_phase2 = not args.disable_phase2
-    copy_as_you_go = bool(getattr(args, "copy_as_you_go", False))
-
-    numeric_dirs = get_numeric_dirs(source_dir)
-    if not numeric_dirs:
-        print(f"错误: 未找到数字子目录: {source_dir}")
-        sys.exit(1)
-
-    cache = load_cache(cache_file)
-    processed = set(cache.get("processed_dirs", []))
-    all_results: List[Dict] = cache.get("results", [])
-    failed_dir_ids = {
-        str(r.get("目录ID"))
-        for r in all_results
-        if r.get("状态") == "处理异常"
-    }
-    if args.retry_failed and failed_dir_ids:
-        processed = {d for d in processed if d not in failed_dir_ids}
-    pending_dirs = [d for d in numeric_dirs if d.name not in processed]
-
-    print("=" * 70)
-    print("评审意见筛选脚本 - 服务器并发版")
-    print("=" * 70)
-    print(f"源目录: {source_dir}")
-    print(f"总目录数: {len(numeric_dirs)}")
-    print(f"待处理: {len(pending_dirs)}")
-    print(f"并发进程数: {args.workers}")
-    print(f"PDF超时秒数: {args.pdf_timeout_seconds}")
-    print(f"二级筛选: {'开启' if do_phase2 else '关闭'}")
-    print(f"缓存文件: {cache_file}")
-    print(f"报表目录: {report_dir}")
-    print(f"失败重跑: {'开启' if args.retry_failed else '关闭'}")
-    print(f"实时复制: {'开启' if copy_as_you_go else '关闭'}")
-
-    final_rows_accum: List[Dict] = []
-
-    if not pending_dirs:
-        print("无需处理,直接输出结果。")
-    else:
-        print(f"开始并发处理目录,共 {len(pending_dirs)} 个...")
-        mp_ctx = mp.get_context("spawn")
-        with ProcessPoolExecutor(max_workers=args.workers, mp_context=mp_ctx) as pool:
-            futures = {
-                pool.submit(
-                    process_one_directory_task,
-                    str(d),
-                    args.phase1_pages,
-                    args.phase2_pages,
-                    do_phase2,
-                    args.pdf_timeout_seconds,
-                ): d.name
-                for d in pending_dirs
-            }
-
-            done_count = 0
-            total = len(pending_dirs)
-            for fut in as_completed(futures):
-                done_count += 1
-                dir_id = futures[fut]
-                try:
-                    payload = fut.result()
-                    all_results.extend(payload["results"])
-                    processed.add(dir_id)
-                    if copy_as_you_go:
-                        # 只对本目录结果做复制与增强字段,避免等到全部结束才看到输出文件
-                        final_rows_accum.extend(copy_outputs(payload["results"], expert_output_dir, company_output_dir))
-                except Exception as e:
-                    all_results.append(
-                        {
-                            "目录ID": dir_id,
-                            "评审类型": "all",
-                            "阶段": "第0阶段",
-                            "状态": "处理异常",
-                            "备注": str(e),
-                            "匹配关键词": "",
-                            "匹配文件数": 0,
-                            "原路径": "",
-                            "原文件名": "",
-                        }
-                    )
-
-                if done_count % args.save_every == 0 or done_count == total:
-                    save_cache(cache_file, {"processed_dirs": sorted(processed), "results": all_results, "start_time": cache.get("start_time")})
-                print(f"[{done_count}/{total}] 完成目录 {dir_id}")
-
-    final_rows = final_rows_accum if copy_as_you_go else copy_outputs(all_results, expert_output_dir, company_output_dir)
-    df = pd.DataFrame(final_rows)
-
-    output_base = report_dir
-    output_base.mkdir(parents=True, exist_ok=True)
-    excel_file = output_base / "评审筛选结果记录_服务器版.xlsx"
-    json_file = output_base / "评审筛选统计_服务器版.json"
-    df.to_excel(excel_file, index=False, engine="openpyxl")
-
-    all_ids = {d.name for d in numeric_dirs}
-    expert_success_ids = {str(r["目录ID"]) for r in final_rows if r.get("评审类型") == "expert" and r.get("状态") == "成功"}
-    company_success_ids = {str(r["目录ID"]) for r in final_rows if r.get("评审类型") == "company" and r.get("状态") == "成功"}
-    expert_missing = sorted(list(all_ids - expert_success_ids), key=lambda x: int(x))
-    company_missing = sorted(list(all_ids - company_success_ids), key=lambda x: int(x))
-    both_missing = sorted(list(all_ids - (expert_success_ids | company_success_ids)), key=lambda x: int(x))
-
-    expert_missing_file = output_base / "无专家审查意见目录ID.txt"
-    both_missing_file = output_base / "既无专家也无公司集团审查意见目录ID.txt"
-    expert_missing_file.write_text("\n".join(expert_missing), encoding="utf-8")
-    both_missing_file.write_text("\n".join(both_missing), encoding="utf-8")
-
-    stats = {
-        "start_time": cache.get("start_time"),
-        "end_time": datetime.now().isoformat(),
-        "total_directories": len(numeric_dirs),
-        "expert_success": len(expert_success_ids),
-        "company_success": len(company_success_ids),
-        "expert_missing_count": len(expert_missing),
-        "company_missing_count": len(company_missing),
-        "both_missing_count": len(both_missing),
-        "expert_missing_ids": expert_missing,
-        "company_missing_ids": company_missing,
-        "both_missing_ids": both_missing,
-        "expert_missing_file": str(expert_missing_file),
-        "both_missing_file": str(both_missing_file),
-        "excel_file": str(excel_file),
-        "expert_output_dir": str(expert_output_dir),
-        "company_output_dir": str(company_output_dir),
-    }
-    json_file.write_text(json.dumps(stats, ensure_ascii=False, indent=2), encoding="utf-8")
-
-    save_cache(cache_file, {"processed_dirs": sorted({d.name for d in numeric_dirs}), "results": final_rows, "start_time": cache.get("start_time")})
-
-    print("=" * 70)
-    print("处理完成")
-    print(f"Excel: {excel_file}")
-    print(f"统计JSON: {json_file}")
-    print(f"无专家审查意见目录ID: {expert_missing_file}")
-    print(f"既无专家也无公司集团审查意见目录ID: {both_missing_file}")
-    print("=" * 70)
-
-
-if __name__ == "__main__":
-    main()

+ 351 - 163
src/app/scripts/ceshi/03-施工方案筛选.py → src/app/scripts/plan_select/03-施工方案筛选.py

@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-评审意见PDF文件筛选脚本 - 多进程并发版+测试模式
+评审意见PDF文件筛选脚本 - 多进程并发版+测试模式+命令行参数
 
 功能说明:
-    从raw/670目录下的数字编号子目录中筛选评审意见PDF文件。
+    从目录下的数字编号子目录中筛选评审意见PDF文件。
 
     一级筛选(默认):
     - 检查前15页是否包含"专家评审"或"公司评审"或"集团评审"
@@ -18,28 +18,32 @@
     3. 如果top5中没找到,则将范围扩大到其余文件
     4. 如果都没找到,记录为"无评审意见"
     5. 如果多份文件都找到关键词,以创建时间最新的为准
-    6. 专家评审输出到output/expert_review目录
-    7. 公司/集团评审输出到output/company_review目录
+    6. 专家评审输出到指定目录
+    7. 公司/集团评审输出到指定目录
     8. 支持断点续传,在temp目录缓存处理进度
-    9. 【新增】支持多进程并发筛选,提高效率
-    10.【新增】支持随机抽取测试模式,快速验证
+    9. 支持多进程并发筛选,提高效率
+    10. 支持随机抽取测试模式,快速验证
+    11. 支持命令行参数配置路径和参数
 
-输入:
-    - 源目录: raw/670/ (包含数字编号子目录,如1567、1569等)
-    - 子目录中的PDF文件名是UUID格式
+使用方式:
+    # 方式1:直接运行(使用文件顶部的默认路径)
+    python 03-施工方案筛选.py
 
-输出:
-    - 专家评审目录: output/expert_review/ (专家评审PDF)
-    - 公司评审目录: output/company_review/ (公司/集团评审PDF)
-    - 结果记录: output/评审筛选结果记录.xlsx
-    - 统计JSON: output/评审筛选统计.json
-    - 缓存文件: data_pipline/script/temp/评审筛选进度缓存.json
+    # 方式2:通过命令行参数指定路径
+    python 03-施工方案筛选.py --source-dir "E:/data/raw" --expert-output-dir "E:/output/expert" --company-output-dir "E:/output/company"
+
+    # 方式3:测试模式(随机抽取10个目录)
+    python 03-施工方案筛选.py --test-mode --test-sample-size 10
+
+    # 方式4:指定并发数和筛选页数
+    python 03-施工方案筛选.py --workers 8 --phase1-pages 20 --phase2-pages 50
 
 作者: Claude
 日期: 2026-04-21
-更新: 2026-05-08 - 增加多进程并发和测试模式
+更新: 2026-05-11 - 增加命令行参数支持
 """
 
+import argparse
 import pandas as pd
 import json
 import sys
@@ -59,30 +63,16 @@ from docx import Document
 warnings.filterwarnings('ignore', category=UserWarning, module='PyPDF2')
 warnings.filterwarnings('ignore', category=Warning)
 
-# ==================== 路径配置(可在文件首部直接修改)====================
-# 规则:
-# 1) 填绝对路径(如 E:/data/raw/670)则直接使用(Windows 建议用 / 或 \\)
-# 2) 填相对路径(如 ../../raw/670)则相对当前脚本目录解析
-SOURCE_DIR = r"E:\提供的原始文件\原始文件\全部的原始文档\未提取"
-EXPERT_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
-COMPANY_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
-TEMP_DIR = "temp"
-
-
-# ==================== 并发与测试配置 ====================
-# 多进程并发配置
-NUM_WORKERS = 4  # 并发进程数,建议设为CPU核心数(如CPU有8核则设为6-8)
-                   # 注意:每个工作进程内部还会为单个PDF创建子进程(超时控制)
-                   # 因此 NUM_WORKERS 不宜过大,避免进程过多导致系统资源耗尽
-
-# 测试模式配置
-TEST_MODE = False       # 是否启用测试模式:随机抽取少量目录快速测试
-TEST_SAMPLE_SIZE = 5    # 测试模式下随机抽取的目录数量
-                        # 测试完成后会输出结果并自动退出,不会进入二级筛选
-
-
-# 分批配置(仅用于统计显示,不创建子目录)
-BATCH_SIZE = 50  # 每批处理的目录数量(仅用于进度显示)
+# ==================== 默认路径配置(命令行未指定时使用)====================
+DEFAULT_SOURCE_DIR = r"E:\提供的原始文件\原始文件\全部的原始文档\未提取"
+DEFAULT_EXPERT_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
+DEFAULT_COMPANY_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
+DEFAULT_TEMP_DIR = "temp"
+DEFAULT_NUM_WORKERS = 4
+DEFAULT_TEST_MODE = False
+DEFAULT_TEST_SAMPLE_SIZE = 5
+DEFAULT_PHASE1_PAGES = 15
+DEFAULT_PHASE2_PAGES = 30
 
 # 关键词配置
 KEYWORDS = {
@@ -272,9 +262,125 @@ KEYWORD_PATTERNS = {
     ],
 }
 
-# 筛选阶段配置
-PHASE_1_PAGES = 15  # 一级筛选:前15页
-PHASE_2_PAGES = 30  # 二级筛选:前30页
+# 默认筛选阶段配置
+DEFAULT_PHASE1_PAGES = 15  # 一级筛选:前15页
+DEFAULT_PHASE2_PAGES = 30  # 二级筛选:前30页
+
+
+def parse_args() -> argparse.Namespace:
+    """解析命令行参数"""
+    parser = argparse.ArgumentParser(
+        description="评审意见PDF文件筛选脚本 - 多进程并发版",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+使用示例:
+  # 使用默认路径运行
+  python 03-施工方案筛选.py
+
+  # 指定输入输出路径
+  python 03-施工方案筛选.py --source-dir "E:/data/raw" --expert-output-dir "E:/output/expert" --company-output-dir "E:/output/company"
+
+  # 测试模式(随机抽取10个目录)
+  python 03-施工方案筛选.py --test-mode --test-sample-size 10
+
+  # 指定并发数和筛选页数
+  python 03-施工方案筛选.py --workers 8 --phase1-pages 20 --phase2-pages 50
+
+  # 禁用二级筛选
+  python 03-施工方案筛选.py --disable-phase2
+
+  # 重试历史处理失败的目录
+  python 03-施工方案筛选.py --retry-failed
+        """
+    )
+
+    # 路径参数
+    parser.add_argument(
+        "--source-dir",
+        default=DEFAULT_SOURCE_DIR,
+        help=f"源目录路径(包含数字编号子目录),默认: {DEFAULT_SOURCE_DIR}"
+    )
+    parser.add_argument(
+        "--expert-output-dir",
+        default=DEFAULT_EXPERT_OUTPUT_DIR,
+        help=f"专家评审输出目录,默认: {DEFAULT_EXPERT_OUTPUT_DIR}"
+    )
+    parser.add_argument(
+        "--company-output-dir",
+        default=DEFAULT_COMPANY_OUTPUT_DIR,
+        help=f"公司/集团评审输出目录,默认: {DEFAULT_COMPANY_OUTPUT_DIR}"
+    )
+    parser.add_argument(
+        "--temp-dir",
+        default=DEFAULT_TEMP_DIR,
+        help=f"缓存目录,默认: {DEFAULT_TEMP_DIR}"
+    )
+
+    # 筛选参数
+    parser.add_argument(
+        "--phase1-pages",
+        type=int,
+        default=DEFAULT_PHASE1_PAGES,
+        help=f"一级筛选检查的页数,默认: {DEFAULT_PHASE1_PAGES}"
+    )
+    parser.add_argument(
+        "--phase2-pages",
+        type=int,
+        default=DEFAULT_PHASE2_PAGES,
+        help=f"二级筛选检查的页数,默认: {DEFAULT_PHASE2_PAGES}"
+    )
+    parser.add_argument(
+        "--disable-phase2",
+        action="store_true",
+        help="禁用二级筛选(仅执行一级筛选)"
+    )
+
+    # 并发参数
+    parser.add_argument(
+        "--workers",
+        type=int,
+        default=DEFAULT_NUM_WORKERS,
+        help=f"并发进程数,默认: {DEFAULT_NUM_WORKERS}"
+    )
+
+    # 测试模式参数
+    parser.add_argument(
+        "--test-mode",
+        action="store_true",
+        help="启用测试模式(随机抽取少量目录快速验证)"
+    )
+    parser.add_argument(
+        "--test-sample-size",
+        type=int,
+        default=DEFAULT_TEST_SAMPLE_SIZE,
+        help=f"测试模式下随机抽取的目录数量,默认: {DEFAULT_TEST_SAMPLE_SIZE}"
+    )
+
+    # 其他参数
+    parser.add_argument(
+        "--pdf-timeout",
+        type=int,
+        default=30,
+        help="单个PDF解析超时秒数,默认: 30"
+    )
+    parser.add_argument(
+        "--cache-every",
+        type=int,
+        default=10,
+        help="每处理多少个目录保存一次缓存,默认: 10"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="显示详细处理日志(默认仅显示进度)"
+    )
+    parser.add_argument(
+        "--retry-failed",
+        action="store_true",
+        help="重试历史处理失败的目录(状态为'处理异常'或'复制失败'的目录)"
+    )
+
+    return parser.parse_args()
 
 
 def get_file_size(file_path: Path) -> int:
@@ -733,10 +839,10 @@ def resolve_config_path(path_value: str, script_dir: Path) -> Path:
 def process_directory_worker(args_tuple):
     """
     工作进程函数:处理单个目录的两种评审类型
-    
+
     此函数在独立的工作进程中运行,同时处理专家评审和公司评审。
     工作进程之间互不影响,各自独立复制文件到输出目录。
-    
+
     Args:
         args_tuple: (
             dir_path_str,      # 目录路径字符串
@@ -746,7 +852,7 @@ def process_directory_worker(args_tuple):
             company_output_dir_str, # 公司评审输出目录
             verbose            # 是否打印详细日志
         )
-    
+
     Returns:
         dict: {
             "dir_id": str,              # 目录ID
@@ -756,32 +862,55 @@ def process_directory_worker(args_tuple):
         }
     """
     dir_path_str, phase, max_pages, expert_output_dir_str, company_output_dir_str, verbose = args_tuple
-    
+
     dir_path = Path(dir_path_str)
     expert_output_dir = Path(expert_output_dir_str)
     company_output_dir = Path(company_output_dir_str)
     dir_id = dir_path.name
-    
+
     results = []
-    
-    # 处理专家评审
-    expert_success, expert_file, expert_result = process_review_type(
-        dir_path, phase=phase, max_pages=max_pages, 
-        review_type="expert", output_dir=expert_output_dir,
-        dir_id=dir_id, verbose=verbose
-    )
-    if expert_result:
-        results.append(expert_result)
-    
-    # 处理公司评审
-    company_success, company_file, company_result = process_review_type(
-        dir_path, phase=phase, max_pages=max_pages, 
-        review_type="company", output_dir=company_output_dir,
-        dir_id=dir_id, verbose=verbose
-    )
-    if company_result:
-        results.append(company_result)
-    
+    expert_success = False
+    company_success = False
+
+    try:
+        # 处理专家评审
+        expert_success, expert_file, expert_result = process_review_type(
+            dir_path, phase=phase, max_pages=max_pages,
+            review_type="expert", output_dir=expert_output_dir,
+            dir_id=dir_id, verbose=verbose
+        )
+        if expert_result:
+            results.append(expert_result)
+
+        # 处理公司评审
+        company_success, company_file, company_result = process_review_type(
+            dir_path, phase=phase, max_pages=max_pages,
+            review_type="company", output_dir=company_output_dir,
+            dir_id=dir_id, verbose=verbose
+        )
+        if company_result:
+            results.append(company_result)
+
+    except Exception as e:
+        # 捕获异常,记录为处理异常
+        error_result = {
+            '目录ID': dir_id,
+            '评审类型': 'all',
+            '阶段': f'第{phase}阶段',
+            '原文件名': '',
+            '新文件名': '',
+            '状态': '处理异常',
+            '匹配来源': '',
+            '匹配关键词': '',
+            '匹配文件数': 0,
+            '文件大小_MB': 0,
+            '备注': str(e),
+            '原路径': str(dir_path),
+            '目标路径': '',
+            '处理时间': datetime.now().isoformat()
+        }
+        results.append(error_result)
+
     return {
         "dir_id": dir_id,
         "expert_success": expert_success,
@@ -793,12 +922,13 @@ def process_directory_worker(args_tuple):
 def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: int,
                            expert_output_dir: Path, company_output_dir: Path,
                            temp_dir: Path, numeric_dirs: List[Path],
-                           stats: Dict, processed_dirs: Set, 
+                           stats: Dict, processed_dirs: Set,
                            phase1_no_match_expert: Set, phase1_no_match_company: Set,
+                           num_workers: int = DEFAULT_NUM_WORKERS,
                            verbose: bool = False, cache_every: int = 10) -> Tuple[Dict, Set, Set]:
     """
     并发运行一个阶段的筛选
-    
+
     Args:
         dirs_to_process: 待处理的目录列表
         phase: 阶段(1或2)
@@ -811,16 +941,17 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
         processed_dirs: 已处理目录集合(会被修改)
         phase1_no_match_expert: 一级未匹配专家评审的目录集合(会被修改)
         phase1_no_match_company: 一级未匹配公司评审的目录集合(会被修改)
+        num_workers: 并发进程数
         verbose: 工作进程是否打印详细日志
         cache_every: 每处理多少个目录保存一次缓存
-    
+
     Returns:
         (stats, phase1_no_match_expert, phase1_no_match_company)
     """
     total = len(dirs_to_process)
     completed = 0
     all_results = []
-    
+
     # 构建参数列表
     args_list = []
     for dir_path in dirs_to_process:
@@ -832,21 +963,28 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
             str(company_output_dir),
             verbose
         ))
-    
-    print(f"\n  启动 {NUM_WORKERS} 个并发进程处理 {total} 个目录...")
+
+    print(f"\n  启动 {num_workers} 个并发进程处理 {total} 个目录...")
     print(f"  工作模式: {'详细日志' if verbose else '静默模式(仅显示进度)'}\n")
-    
+
     # 使用进程池并发处理
-    with mp.Pool(processes=NUM_WORKERS) as pool:
+    with mp.Pool(processes=num_workers) as pool:
         # imap_unordered 不保证顺序,但返回速度最快
         for result in pool.imap_unordered(process_directory_worker, args_list):
-            dir_id = result["dir_id"]
-            
+            dir_id = result.get("dir_id", "unknown")
+            results_list = result.get("results", [])
+
+            # 检查是否有处理异常
+            has_error = any(r.get("状态") in ["处理异常", "复制失败"] for r in results_list)
+
             # 更新结果列表
-            all_results.extend(result["results"])
-            
+            all_results.extend(results_list)
+
             # 更新统计
-            if result["expert_success"]:
+            expert_success = result.get("expert_success", False)
+            company_success = result.get("company_success", False)
+
+            if expert_success:
                 stats["expert_success_count"] = stats.get("expert_success_count", 0) + 1
                 # 如果之前标记为未匹配,现在成功了,移除标记
                 phase1_no_match_expert.discard(dir_id)
@@ -854,25 +992,26 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
                 # 只有在一级筛选时才添加未匹配标记
                 if phase == 1:
                     phase1_no_match_expert.add(dir_id)
-            
-            if result["company_success"]:
+
+            if company_success:
                 stats["company_success_count"] = stats.get("company_success_count", 0) + 1
                 phase1_no_match_company.discard(dir_id)
             else:
                 if phase == 1:
                     phase1_no_match_company.add(dir_id)
-            
+
             # 标记为已处理
             processed_dirs.add(dir_id)
             completed += 1
-            
-            # 显示进度
+
+            # 显示进度(如果有异常则显示警告)
             progress = completed / total * 100
+            status_indicator = "⚠️" if has_error else ""
             print(f"\r  进度: {completed}/{total} ({progress:.1f}%) | "
                   f"专家成功: {stats.get('expert_success_count', 0)} | "
                   f"公司成功: {stats.get('company_success_count', 0)} | "
-                  f"当前: {dir_id}", end="", flush=True)
-            
+                  f"当前: {dir_id} {status_indicator}", end="", flush=True)
+
             # 定期保存缓存
             if completed % cache_every == 0 or completed == total:
                 cache_data = {
@@ -885,7 +1024,7 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
                     "total_directories": len(numeric_dirs)
                 }
                 save_progress_cache(temp_dir, cache_data)
-    
+
     print(f"\n\n  ✅ 阶段完成!处理 {completed} 个目录")
     print(f"     专家评审成功: {stats.get('expert_success_count', 0)} 个")
     print(f"     公司评审成功: {stats.get('company_success_count', 0)} 个")
@@ -895,28 +1034,44 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
 
 def main():
     """主函数"""
+    # 解析命令行参数
+    args = parse_args()
+
     print("=" * 70)
-    print("评审意见PDF筛选脚本 - 多进程并发版+测试模式")
+    print("评审意见PDF筛选脚本 - 多进程并发版+测试模式+命令行参数")
     print("=" * 70)
 
-    # 按文件首部配置组装路径(不再按项目根目录拼接)
+    # 按参数组装路径
     script_dir = Path(__file__).parent
-    source_base_dir = resolve_config_path(SOURCE_DIR, script_dir)
-    expert_output_dir = resolve_config_path(EXPERT_OUTPUT_DIR, script_dir)
-    company_output_dir = resolve_config_path(COMPANY_OUTPUT_DIR, script_dir)
-    temp_dir = resolve_config_path(TEMP_DIR, script_dir)
+    source_base_dir = resolve_config_path(args.source_dir, script_dir)
+    expert_output_dir = resolve_config_path(args.expert_output_dir, script_dir)
+    company_output_dir = resolve_config_path(args.company_output_dir, script_dir)
+    temp_dir = resolve_config_path(args.temp_dir, script_dir)
+
+    # 从参数获取配置
+    num_workers = args.workers
+    test_mode = args.test_mode
+    test_sample_size = args.test_sample_size
+    phase1_pages = args.phase1_pages
+    phase2_pages = args.phase2_pages
+    disable_phase2 = args.disable_phase2
+    pdf_timeout = args.pdf_timeout
+    cache_every = args.cache_every
+    verbose_mode = args.verbose
+    retry_failed = args.retry_failed
 
     print(f"\n【配置信息】")
     print(f"  源目录: {source_base_dir}")
     print(f"  专家评审输出目录: {expert_output_dir}")
     print(f"  公司评审输出目录: {company_output_dir}")
     print(f"  缓存目录: {temp_dir}")
-    print(f"  并发进程数: {NUM_WORKERS}")
-    print(f"  测试模式: {'是(抽取5个目录)' if TEST_MODE else '否'}")
-    print(f"  专家评审关键词: {KEYWORDS['expert']}")
-    print(f"  公司评审关键词: {KEYWORDS['company']}")
-    print(f"  一级筛选: 前{PHASE_1_PAGES}页")
-    print(f"  二级筛选: 前{PHASE_2_PAGES}页")
+    print(f"  并发进程数: {num_workers}")
+    print(f"  测试模式: {'是(抽取{}个目录)'.format(test_sample_size) if test_mode else '否'}")
+    print(f"  一级筛选: 前{phase1_pages}页")
+    print(f"  二级筛选: {'禁用' if disable_phase2 else '前{}页'.format(phase2_pages)}")
+    print(f"  PDF超时秒数: {pdf_timeout}")
+    print(f"  详细日志: {'是' if verbose_mode else '否'}")
+    print(f"  失败重试: {'是' if retry_failed else '否'}")
 
     # 检查 PyCryptodome 库
     try:
@@ -936,19 +1091,19 @@ def main():
         sys.exit(1)
 
     print(f"  找到 {len(numeric_dirs)} 个数字编号子目录")
-    
+
     # ==================== 测试模式:随机抽取 ====================
-    if TEST_MODE:
-        print(f"\n【测试模式】随机抽取 {TEST_SAMPLE_SIZE} 个目录进行测试...")
-        if len(numeric_dirs) <= TEST_SAMPLE_SIZE:
+    if test_mode:
+        print(f"\n【测试模式】随机抽取 {test_sample_size} 个目录进行测试...")
+        if len(numeric_dirs) <= test_sample_size:
             test_dirs = numeric_dirs
-            print(f"  目录总数不足 {TEST_SAMPLE_SIZE},测试全部 {len(numeric_dirs)} 个目录")
+            print(f"  目录总数不足 {test_sample_size},测试全部 {len(numeric_dirs)} 个目录")
         else:
             # 使用固定随机种子,确保可复现
             random.seed(42)
-            test_dirs = random.sample(numeric_dirs, TEST_SAMPLE_SIZE)
+            test_dirs = random.sample(numeric_dirs, test_sample_size)
             test_dirs.sort(key=lambda d: int(d.name))  # 按数字排序,方便查看
-        
+
         numeric_dirs = test_dirs
         print(f"  测试目录: {[d.name for d in test_dirs]}")
         # 测试模式不加载缓存,不进入二级筛选
@@ -965,26 +1120,46 @@ def main():
         # 加载缓存(断点续传)
         print(f"\n【步骤 2/6】加载进度缓存...")
         cache = load_progress_cache(temp_dir)
-    
+
     processed_dirs = set(cache.get("processed_dirs", []))
     phase1_no_match_expert = set(cache.get("phase1_no_match_expert", []))
     phase1_no_match_company = set(cache.get("phase1_no_match_company", []))
     current_phase = cache.get("phase", 1)
     all_results = cache.get("results", [])
 
-    if processed_dirs and not TEST_MODE:
+    # ==================== 失败重试机制 ====================
+    # 从历史结果中识别失败的目录
+    failed_dir_ids = set()
+    if all_results:
+        for result in all_results:
+            status = result.get("状态", "")
+            if status in ["处理异常", "复制失败", "读取失败", "解析失败"]:
+                failed_dir_ids.add(str(result.get("目录ID", "")))
+
+    if retry_failed and failed_dir_ids:
+        print(f"\n【失败重试】发现 {len(failed_dir_ids)} 个历史失败的目录:")
+        print(f"  失败目录ID: {sorted(list(failed_dir_ids), key=lambda x: int(x) if x.isdigit() else 0)[:10]}{'...' if len(failed_dir_ids) > 10 else ''}")
+        # 将失败目录从已处理列表中移除,使其重新被处理
+        processed_dirs = {d for d in processed_dirs if d not in failed_dir_ids}
+        # 同时从结果列表中移除失败记录,避免重复
+        all_results = [r for r in all_results if str(r.get("目录ID", "")) not in failed_dir_ids]
+        print(f"  已将失败目录重新加入待处理列表")
+
+    if processed_dirs and not test_mode:
         print(f"  发现缓存:")
         print(f"    - 已处理: {len(processed_dirs)} 个目录")
         print(f"    - 专家评审一级未找到: {len(phase1_no_match_expert)} 个目录")
         print(f"    - 公司评审一级未找到: {len(phase1_no_match_company)} 个目录")
         print(f"    - 当前阶段: 第{current_phase}阶段")
+        if failed_dir_ids and not retry_failed:
+            print(f"    - 历史失败目录: {len(failed_dir_ids)} 个(可通过 --retry-failed 参数重试)")
     else:
-        print(f"  {'无缓存(测试模式),将从头开始处理' if TEST_MODE else '无缓存,将从头开始处理'}")
+        print(f"  {'无缓存(测试模式),将从头开始处理' if test_mode else '无缓存,将从头开始处理'}")
 
     # ==================== 一级筛选 ====================
     if current_phase == 1:
         print(f"\n{'='*70}")
-        print("【第1阶段】一级筛选(检查前15页)")
+        print(f"【第1阶段】一级筛选(检查前{phase1_pages}页)")
         print('='*70)
 
         dirs_to_process = [d for d in numeric_dirs if d.name not in processed_dirs]
@@ -1017,19 +1192,20 @@ def main():
                     stats[key] = value
 
             # 并发处理一级筛选
-            # 在测试模式下使用详细日志(verbose=True),正式运行使用静默模式(verbose=False)
-            verbose_mode = TEST_MODE  # 测试模式打印详细日志,正式模式静默
+            # 测试模式或 verbose 参数启用时打印详细日志
+            phase1_verbose = test_mode or verbose_mode
             stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
-                dirs_to_process, phase=1, max_pages=PHASE_1_PAGES,
+                dirs_to_process, phase=1, max_pages=phase1_pages,
                 expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
                 temp_dir=temp_dir, numeric_dirs=numeric_dirs,
                 stats=stats, processed_dirs=processed_dirs,
                 phase1_no_match_expert=phase1_no_match_expert,
                 phase1_no_match_company=phase1_no_match_company,
-                verbose=verbose_mode,
-                cache_every=10
+                num_workers=num_workers,
+                verbose=phase1_verbose,
+                cache_every=cache_every
             )
-            
+
             # 更新结果列表
             cache = load_progress_cache(temp_dir)
             all_results = cache.get("results", [])
@@ -1041,7 +1217,7 @@ def main():
             print(f"  公司评审未找到: {len(phase1_no_match_company)} 个")
 
             # 测试模式下不进行二级筛选:保存最终结果后直接返回
-            if TEST_MODE:
+            if test_mode:
                 print(f"\n{'='*70}")
                 print("【测试模式完成】")
                 print(f"  共测试 {len(dirs_to_process)} 个目录")
@@ -1049,7 +1225,7 @@ def main():
                 print(f"  公司评审成功: {stats['company_success_count']} 个")
                 print(f"  测试结果已保存到缓存,可查看输出目录确认文件")
                 print("="*70)
-                
+
                 # 测试模式也保存最终结果
                 _save_final_results(
                     temp_dir, expert_output_dir, company_output_dir,
@@ -1057,51 +1233,62 @@ def main():
                 )
                 return
 
-            # 询问是否进行二级筛选
-            total_no_match = len(phase1_no_match_expert.union(phase1_no_match_company))
-            print(f"\n{'='*70}")
-            print("是否进行二级筛选?")
-            print(f"  - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
-            print(f"  - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
-            print(f"  - 扩大检查范围到前{PHASE_2_PAGES}页")
-            print('='*70)
-
-            while True:
-                try:
-                    user_input = input("请输入 (y/n): ").strip().lower()
-                    if user_input in ['y', 'yes', '是']:
-                        enable_phase2 = True
-                        break
-                    elif user_input in ['n', 'no', '否']:
-                        enable_phase2 = False
-                        break
-                    else:
-                        print("  请输入 y 或 n")
-                except KeyboardInterrupt:
-                    print("\n\n用户中断,进度已保存")
-                    sys.exit(0)
-
-            if not enable_phase2:
+            # 判断是否进行二级筛选
+            # 如果参数指定禁用二级筛选,则直接跳过
+            if disable_phase2:
+                print(f"\n{'='*70}")
+                print("二级筛选已禁用(--disable-phase2 参数)")
+                print(f"  - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
+                print(f"  - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
+                print('='*70)
                 print("\n  跳过二级筛选,直接保存结果...")
-                current_phase = 3  # 跳过二级筛选,直接保存
+                current_phase = 3
             else:
-                current_phase = 2
-                # 保存进入第二阶段的标记
-                cache_data = {
-                    "processed_dirs": list(processed_dirs),
-                    "phase1_no_match_expert": list(phase1_no_match_expert),
-                    "phase1_no_match_company": list(phase1_no_match_company),
-                    "results": all_results,
-                    "stats": stats,
-                    "phase": 2,
-                    "total_directories": len(numeric_dirs)
-                }
-                save_progress_cache(temp_dir, cache_data)
+                # 询问是否进行二级筛选
+                total_no_match = len(phase1_no_match_expert.union(phase1_no_match_company))
+                print(f"\n{'='*70}")
+                print("是否进行二级筛选?")
+                print(f"  - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
+                print(f"  - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
+                print(f"  - 扩大检查范围到前{phase2_pages}页")
+                print('='*70)
+
+                while True:
+                    try:
+                        user_input = input("请输入 (y/n): ").strip().lower()
+                        if user_input in ['y', 'yes', '是']:
+                            enable_phase2 = True
+                            break
+                        elif user_input in ['n', 'no', '否']:
+                            enable_phase2 = False
+                            break
+                        else:
+                            print("  请输入 y 或 n")
+                    except KeyboardInterrupt:
+                        print("\n\n用户中断,进度已保存")
+                        sys.exit(0)
+
+                if not enable_phase2:
+                    print("\n  跳过二级筛选,直接保存结果...")
+                    current_phase = 3  # 跳过二级筛选,直接保存
+                else:
+                    current_phase = 2
+                    # 保存进入第二阶段的标记
+                    cache_data = {
+                        "processed_dirs": list(processed_dirs),
+                        "phase1_no_match_expert": list(phase1_no_match_expert),
+                        "phase1_no_match_company": list(phase1_no_match_company),
+                        "results": all_results,
+                        "stats": stats,
+                        "phase": 2,
+                        "total_directories": len(numeric_dirs)
+                    }
+                    save_progress_cache(temp_dir, cache_data)
 
     # ==================== 二级筛选 ====================
     if current_phase == 2:
         print(f"\n{'='*70}")
-        print("【第2阶段】二级筛选(检查前30页)")
+        print(f"【第2阶段】二级筛选(检查前{phase2_pages}页)")
         print('='*70)
 
         # 重新加载以获取最新状态
@@ -1131,18 +1318,19 @@ def main():
             # 但由于 process_directory_worker 会同时处理两种类型,
             # 已成功的类型会再次被处理(但结果相同,不会重复复制因为文件名相同会覆盖)
             # 为了效率,我们只处理有未匹配的目录
-            
+
             stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
-                all_phase2_dirs, phase=2, max_pages=PHASE_2_PAGES,
+                all_phase2_dirs, phase=2, max_pages=phase2_pages,
                 expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
                 temp_dir=temp_dir, numeric_dirs=numeric_dirs,
                 stats=stats, processed_dirs=processed_dirs,
                 phase1_no_match_expert=phase1_no_match_expert,
                 phase1_no_match_company=phase1_no_match_company,
-                verbose=False,  # 二级筛选使用静默模式
-                cache_every=10
+                num_workers=num_workers,
+                verbose=verbose_mode,
+                cache_every=cache_every
             )
-            
+
             # 更新结果
             cache = load_progress_cache(temp_dir)
             all_results = cache.get("results", [])

+ 0 - 0
src/app/scripts/ceshi/temp/评审筛选进度缓存_已完成.json → src/app/scripts/plan_select/temp/评审筛选进度缓存_已完成.json