|
|
@@ -1,7 +1,7 @@
|
|
|
#!/usr/bin/env python3
|
|
|
# -*- coding: utf-8 -*-
|
|
|
"""
|
|
|
-评审意见PDF文件筛选脚本 - 二级筛选+断点续传版
|
|
|
+评审意见PDF文件筛选脚本 - 多进程并发版+测试模式
|
|
|
|
|
|
功能说明:
|
|
|
从raw/670目录下的数字编号子目录中筛选评审意见PDF文件。
|
|
|
@@ -21,6 +21,8 @@
|
|
|
6. 专家评审输出到output/expert_review目录
|
|
|
7. 公司/集团评审输出到output/company_review目录
|
|
|
8. 支持断点续传,在temp目录缓存处理进度
|
|
|
+ 9. 【新增】支持多进程并发筛选,提高效率
|
|
|
+ 10.【新增】支持随机抽取测试模式,快速验证
|
|
|
|
|
|
输入:
|
|
|
- 源目录: raw/670/ (包含数字编号子目录,如1567、1569等)
|
|
|
@@ -35,6 +37,7 @@
|
|
|
|
|
|
作者: Claude
|
|
|
日期: 2026-04-21
|
|
|
+更新: 2026-05-08 - 增加多进程并发和测试模式
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
@@ -44,6 +47,8 @@ import os
|
|
|
import shutil
|
|
|
import re
|
|
|
import warnings
|
|
|
+import multiprocessing as mp
|
|
|
+import random
|
|
|
from pathlib import Path
|
|
|
from datetime import datetime
|
|
|
from typing import List, Dict, Tuple, Optional, Set
|
|
|
@@ -59,36 +64,211 @@ warnings.filterwarnings('ignore', category=Warning)
|
|
|
# 1) 填绝对路径(如 E:/data/raw/670)则直接使用(Windows 建议用 / 或 \\)
|
|
|
# 2) 填相对路径(如 ../../raw/670)则相对当前脚本目录解析
|
|
|
SOURCE_DIR = r"E:\提供的原始文件\原始文件\全部的原始文档\未提取"
|
|
|
-EXPERT_OUTPUT_DIR = r"F:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
|
|
|
-COMPANY_OUTPUT_DIR = r"F:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
|
|
|
+EXPERT_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
|
|
|
+COMPANY_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
|
|
|
TEMP_DIR = "temp"
|
|
|
|
|
|
|
|
|
+# ==================== 并发与测试配置 ====================
|
|
|
+# 多进程并发配置
|
|
|
+NUM_WORKERS = 4 # 并发进程数,建议设为CPU核心数(如CPU有8核则设为6-8)
|
|
|
+ # 注意:每个工作进程内部还会为单个PDF创建子进程(超时控制)
|
|
|
+ # 因此 NUM_WORKERS 不宜过大,避免进程过多导致系统资源耗尽
|
|
|
+
|
|
|
+# 测试模式配置
|
|
|
+TEST_MODE = False # 是否启用测试模式:随机抽取少量目录快速测试
|
|
|
+TEST_SAMPLE_SIZE = 5 # 测试模式下随机抽取的目录数量
|
|
|
+ # 测试完成后会输出结果并自动退出,不会进入二级筛选
|
|
|
+
|
|
|
+
|
|
|
# 分批配置(仅用于统计显示,不创建子目录)
|
|
|
BATCH_SIZE = 50 # 每批处理的目录数量(仅用于进度显示)
|
|
|
|
|
|
# 关键词配置
|
|
|
KEYWORDS = {
|
|
|
- # 专家意见:必须命中“专家审查”相关表述(仅“评审/评估”不算专家审查)
|
|
|
+ # 专家意见:涵盖"审查"、"评审"、"论证"、"咨询"、"签字"等多种表述
|
|
|
"expert": [
|
|
|
+ # === 专家审查系列(核心关键词)===
|
|
|
"专家审查意见", "专家审查记录", "专家审查结论",
|
|
|
"专家审查说明", "专家审查建议", "专家审查纪要", "专家审查报告",
|
|
|
"专家审查审核表", "专家审查审查表",
|
|
|
+
|
|
|
+ # === 专家评审系列(评审与审查并重)===
|
|
|
+ "专家评审意见", "专家评审记录", "专家评审结论",
|
|
|
+ "专家评审说明", "专家评审建议", "专家评审纪要", "专家评审报告",
|
|
|
+ "专家评审审核表", "专家评审表",
|
|
|
+
|
|
|
+ # === 专家论证系列(常见于危大工程)===
|
|
|
+ "专家论证意见", "专家论证记录", "专家论证结论",
|
|
|
+ "专家论证说明", "专家论证建议", "专家论证纪要", "专家论证报告",
|
|
|
+ "专家论证审核表", "专家论证审查表",
|
|
|
+
|
|
|
+ # === 专家组系列(多位专家集体意见)===
|
|
|
+ "专家组意见", "专家组审查意见", "专家组评审意见",
|
|
|
+ "专家组论证意见", "专家组建议", "专家组结论",
|
|
|
+ "专家组纪要", "专家组报告",
|
|
|
+
|
|
|
+ # === 专家咨询系列(技术咨询类)===
|
|
|
+ "专家咨询意见", "专家咨询建议", "专家咨询记录",
|
|
|
+ "专家咨询结论", "专家咨询说明",
|
|
|
+
|
|
|
+ # === 专家签字/签名系列(专家参与确认)===
|
|
|
+ "专家签字", "专家签名", "专家签章",
|
|
|
+ "专家签字表", "专家签名表", "专家签认",
|
|
|
+ "专家确认", "专家审核签字",
|
|
|
+
|
|
|
+ # === 专家意见回复/修改系列(回复与整改)===
|
|
|
+ "专家意见回复", "专家意见修改回复", "专家意见整改回复",
|
|
|
+ "专家意见回复表", "专家意见修改表", "专家意见回复单",
|
|
|
+
|
|
|
+ # === 专家意见通用表述(兜底关键词)===
|
|
|
+ "专家意见", "专家建议", "专家结论",
|
|
|
+ "专家名单", "专家签到表", "专家签到",
|
|
|
+
|
|
|
+ # === 常见简写/变体 ===
|
|
|
+ "专家意见及回复", "专家意见及整改", "专家意见及修改",
|
|
|
+ "专家审查结论表", "专家评审结论表", "专家论证结论表",
|
|
|
+ "专家技术意见", "专家技术审查", "专家技术评审",
|
|
|
],
|
|
|
- # 公司/集团:必须包含“公司”或“集团”主体表述
|
|
|
- "company": ["公司评审意见", "集团评审意见", "公司审核意见", "集团审核意见", "公司审查意见", "集团审查意见"] # 公司/集团关键词
|
|
|
+
|
|
|
+ # 公司/集团:涵盖"公司"、"集团"、"企业"、"项目部"、"总包"、"监理"等多种主体
|
|
|
+ "company": [
|
|
|
+ # === 公司/集团系列(核心关键词)===
|
|
|
+ "公司评审意见", "集团评审意见", "公司审核意见", "集团审核意见",
|
|
|
+ "公司审查意见", "集团审查意见",
|
|
|
+ "公司评审记录", "集团评审记录", "公司审核记录", "集团审核记录",
|
|
|
+ "公司审查记录", "集团审查记录",
|
|
|
+ "公司评审纪要", "集团评审纪要", "公司审核纪要", "集团审核纪要",
|
|
|
+ "公司审查纪要", "集团审查纪要",
|
|
|
+ "公司评审报告", "集团评审报告", "公司审核报告", "集团审核报告",
|
|
|
+ "公司审查报告", "集团审查报告",
|
|
|
+
|
|
|
+ # === 企业系列(企业为主体)===
|
|
|
+ "企业评审意见", "企业审核意见", "企业审查意见",
|
|
|
+ "企业评审记录", "企业审核记录", "企业审查记录",
|
|
|
+ "企业评审纪要", "企业审核纪要", "企业审查纪要",
|
|
|
+ "企业技术负责人意见", "企业技术负责人审核",
|
|
|
+
|
|
|
+ # === 项目部/项目系列(项目部为主体)===
|
|
|
+ "项目部评审意见", "项目部审核意见", "项目部审查意见",
|
|
|
+ "项目部评审记录", "项目部审核记录", "项目部审查记录",
|
|
|
+ "项目评审意见", "项目审核意见", "项目审查意见",
|
|
|
+ "项目经理意见", "项目经理审核",
|
|
|
+
|
|
|
+ # === 总包/总承包系列(总包单位为主体)===
|
|
|
+ "总包评审意见", "总包审核意见", "总包审查意见",
|
|
|
+ "总承包评审意见", "总承包审核意见", "总承包审查意见",
|
|
|
+ "总包单位意见", "总承包单位意见",
|
|
|
+
|
|
|
+ # === 内部评审系列(内部流程)===
|
|
|
+ "内部评审意见", "内部审核意见", "内部审查意见",
|
|
|
+ "内部评审记录", "内部审核记录", "内部审查记录",
|
|
|
+ "内部评审纪要", "内部审核纪要", "内部审查纪要",
|
|
|
+ "内部审批意见", "内部会签意见",
|
|
|
+
|
|
|
+ # === 审批/会签系列(审批流程)===
|
|
|
+ "公司审批意见", "集团审批意见", "企业审批意见",
|
|
|
+ "公司会签意见", "集团会签意见", "企业会签意见",
|
|
|
+ "审批意见", "会签意见", "审批记录",
|
|
|
+
|
|
|
+ # === 监理系列(监理单位意见)===
|
|
|
+ "监理评审意见", "监理审核意见", "监理审查意见",
|
|
|
+ "监理记录", "监理纪要", "监理报告",
|
|
|
+ "监理工程师意见", "总监理工程师意见",
|
|
|
+
|
|
|
+ # === 常见简写/变体 ===
|
|
|
+ "公司意见", "集团意见", "企业意见",
|
|
|
+ "公司技术意见", "集团技术意见",
|
|
|
+ "施工方案评审意见", "施工方案审核意见", "施工方案审查意见",
|
|
|
+ "方案评审意见", "方案审核意见", "方案审查意见",
|
|
|
+ "评审会纪要", "审核会纪要", "审查会纪要",
|
|
|
+ "评审会议记录", "审核会议记录", "审查会议记录",
|
|
|
+ ]
|
|
|
}
|
|
|
|
|
|
# 更宽松但可控的规则,用于提升OCR/版式噪声下的命中率
|
|
|
KEYWORD_PATTERNS = {
|
|
|
"expert": [
|
|
|
- # 仅允许“审查”语义命中(避免把“专家评审/评估”误判为专家审查)
|
|
|
- r"专家.{0,12}审查.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表)",
|
|
|
- r"审查.{0,10}专家.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表)",
|
|
|
+ # === 专家审查系列 ===
|
|
|
+ r"专家.{0,12}审查.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表|结论表)",
|
|
|
+ r"审查.{0,10}专家.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|审查表|结论表)",
|
|
|
+
|
|
|
+ # === 专家评审系列 ===
|
|
|
+ r"专家.{0,12}评审.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|评审表|结论表)",
|
|
|
+ r"评审.{0,10}专家.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|评审表|结论表)",
|
|
|
+
|
|
|
+ # === 专家论证系列(危大工程常见)===
|
|
|
+ r"专家.{0,12}论证.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|论证表|结论表)",
|
|
|
+ r"论证.{0,10}专家.{0,12}(意见|记录|结论|说明|建议|纪要|报告|审核表|论证表|结论表)",
|
|
|
+
|
|
|
+ # === 专家组系列 ===
|
|
|
+ r"专家组.{0,10}(审查|评审|论证).{0,10}(意见|记录|结论|说明|建议|纪要|报告)",
|
|
|
+ r"专家组.{0,10}(意见|建议|结论)",
|
|
|
+
|
|
|
+ # === 专家咨询系列 ===
|
|
|
+ r"专家.{0,10}咨询.{0,10}(意见|建议|记录|结论|说明)",
|
|
|
+ r"咨询.{0,8}专家.{0,10}(意见|建议|记录|结论|说明)",
|
|
|
+
|
|
|
+ # === 专家签字/签名系列 ===
|
|
|
+ r"专家.{0,8}(签字|签名|签章|签认|确认)",
|
|
|
+ r"(签字|签名|签章).{0,8}专家",
|
|
|
+
|
|
|
+ # === 专家意见回复/修改系列 ===
|
|
|
+ r"专家.{0,8}意见.{0,8}(回复|修改|整改).{0,8}(表|单|记录)",
|
|
|
+ r"(审查|评审|论证).{0,8}意见.{0,8}(回复|修改|整改).{0,8}(表|单|记录)",
|
|
|
+ r"(审查|评审|论证).{0,8}意见.{0,8}回复",
|
|
|
+ r"对.{0,10}专家.{0,10}意见.{0,10}回复",
|
|
|
+
|
|
|
+ # === 专家通用表述(兜底)===
|
|
|
+ r"专家.{0,15}(意见|建议|结论)",
|
|
|
+ r"专家.{0,8}(名单|签到)",
|
|
|
+ r"专家.{0,12}(技术.{0,4})?(审查|评审|论证)",
|
|
|
+
|
|
|
+ # === 常见简写变体 ===
|
|
|
+ r"专家.{0,8}(审查|评审|论证).{0,8}结论",
|
|
|
+ r"专家.{0,10}意见.{0,10}(及|和).{0,10}(回复|修改|整改)",
|
|
|
],
|
|
|
+
|
|
|
"company": [
|
|
|
- r"(公司|集团).{0,10}(评审|审核|审查).{0,10}(意见|说明|记录)",
|
|
|
- r"(公司|集团).{0,10}施工方案.{0,10}(审核|审查|评审).{0,10}(意见|说明|记录)",
|
|
|
+ # === 公司/集团系列 ===
|
|
|
+ r"(公司|集团).{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"(公司|集团).{0,10}施工方案.{0,10}(审核|审查|评审).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"(公司|集团).{0,10}(审批|会签).{0,10}意见",
|
|
|
+ r"(公司|集团).{0,10}技术.{0,8}(负责人|主管).{0,8}意见",
|
|
|
+
|
|
|
+ # === 企业系列 ===
|
|
|
+ r"企业.{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"企业.{0,10}技术.{0,8}(负责人|主管).{0,8}(意见|审核)",
|
|
|
+
|
|
|
+ # === 项目部系列 ===
|
|
|
+ r"项目(部)?.{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"项目(部)?.{0,10}经理.{0,8}(意见|审核)",
|
|
|
+ r"项目(部)?.{0,10}技术.{0,8}(负责人|主管).{0,8}(意见|审核)",
|
|
|
+
|
|
|
+ # === 总包/总承包系列 ===
|
|
|
+ r"(总包|总承包).{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"(总包|总承包).{0,10}单位.{0,8}意见",
|
|
|
+
|
|
|
+ # === 内部评审系列 ===
|
|
|
+ r"内部.{0,10}(评审|审核|审查|审批|会签).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+
|
|
|
+ # === 监理系列 ===
|
|
|
+ r"监理.{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"监理.{0,8}工程师.{0,8}意见",
|
|
|
+ r"总监理.{0,8}工程师.{0,8}意见",
|
|
|
+
|
|
|
+ # === 施工方案评审系列 ===
|
|
|
+ r"施工方案.{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+ r"方案.{0,10}(评审|审核|审查).{0,10}(意见|说明|记录|纪要|报告)",
|
|
|
+
|
|
|
+ # === 会议/纪要系列 ===
|
|
|
+ r"(评审|审核|审查).{0,8}会.{0,8}(纪要|记录)",
|
|
|
+ r"(评审|审核|审查).{0,8}会议.{0,8}(纪要|记录)",
|
|
|
+ r"(评审|审核|审查).{0,8}纪要",
|
|
|
+
|
|
|
+ # === 通用兜底 ===
|
|
|
+ r"(公司|集团|企业).{0,8}意见",
|
|
|
+ r"(公司|集团|企业).{0,8}技术.{0,8}意见",
|
|
|
],
|
|
|
}
|
|
|
|
|
|
@@ -141,14 +321,12 @@ def _extract_pdf_text_worker(pdf_path_str: str, max_pages: int, result_queue):
|
|
|
result_queue.put({"ok": False, "error": str(e)})
|
|
|
|
|
|
|
|
|
-def extract_text_with_pages(pdf_path: Path, max_pages: int, timeout_seconds: int = 30) -> str:
|
|
|
+def extract_text_with_pages(pdf_path: Path, max_pages: int, timeout_seconds: int = 30, verbose: bool = True) -> str:
|
|
|
"""从PDF文件中提取文本内容(指定页数),带硬超时机制(子进程)"""
|
|
|
- import multiprocessing as mp
|
|
|
-
|
|
|
text = ""
|
|
|
file_size_mb = pdf_path.stat().st_size / (1024 * 1024)
|
|
|
|
|
|
- if file_size_mb > 50:
|
|
|
+ if verbose and file_size_mb > 50:
|
|
|
print(f"\n [大文件 {file_size_mb:.1f}MB,读取中...]", end="", flush=True)
|
|
|
|
|
|
try:
|
|
|
@@ -165,24 +343,27 @@ def extract_text_with_pages(pdf_path: Path, max_pages: int, timeout_seconds: int
|
|
|
if process.is_alive():
|
|
|
process.terminate()
|
|
|
process.join(timeout=2)
|
|
|
- print(f" [超时跳过]", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ print(f" [超时跳过]", end="", flush=True)
|
|
|
return ""
|
|
|
|
|
|
result = result_queue.get_nowait() if not result_queue.empty() else {"ok": False, "error": "子进程无返回结果"}
|
|
|
|
|
|
if not result.get("ok"):
|
|
|
error_msg = result.get("error", "")
|
|
|
- if "PyCryptodome" in error_msg or "AES" in error_msg:
|
|
|
- print(f" [加密PDF需PyCryptodome]", end="", flush=True)
|
|
|
- elif "Password" in error_msg or "password" in error_msg:
|
|
|
- print(f" [PDF加密需要密码]", end="", flush=True)
|
|
|
- else:
|
|
|
- print(f" [读取失败]", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ if "PyCryptodome" in error_msg or "AES" in error_msg:
|
|
|
+ print(f" [加密PDF需PyCryptodome]", end="", flush=True)
|
|
|
+ elif "Password" in error_msg or "password" in error_msg:
|
|
|
+ print(f" [PDF加密需要密码]", end="", flush=True)
|
|
|
+ else:
|
|
|
+ print(f" [读取失败]", end="", flush=True)
|
|
|
return ""
|
|
|
|
|
|
text = result.get("text", "")
|
|
|
except Exception:
|
|
|
- print(f" [读取错误]", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ print(f" [读取错误]", end="", flush=True)
|
|
|
|
|
|
return text
|
|
|
|
|
|
@@ -206,7 +387,7 @@ def check_pdf_contains_keywords_with_pages(pdf_path: Path, keywords: List[str],
|
|
|
def extract_docx_text(docx_path: Path, max_pages: int) -> str:
|
|
|
"""提取DOCX文本(按段落近似页数限制)"""
|
|
|
try:
|
|
|
- # Word没有固定分页信息,这里用“每页约40段”进行近似截断,避免读取过慢。
|
|
|
+ # Word没有固定分页信息,这里用"每页约40段"进行近似截断,避免读取过慢。
|
|
|
approx_max_paragraphs = max(1, max_pages * 40)
|
|
|
doc = Document(str(docx_path))
|
|
|
texts = []
|
|
|
@@ -220,15 +401,16 @@ def extract_docx_text(docx_path: Path, max_pages: int) -> str:
|
|
|
return ""
|
|
|
|
|
|
|
|
|
-def check_file_contains_keywords_with_pages(file_path: Path, keywords: List[str], max_pages: int) -> Tuple[bool, str]:
|
|
|
+def check_file_contains_keywords_with_pages(file_path: Path, keywords: List[str], max_pages: int, verbose: bool = True) -> Tuple[bool, str]:
|
|
|
"""检查文件(PDF/DOCX)指定范围内是否包含任一关键词"""
|
|
|
suffix = file_path.suffix.lower()
|
|
|
if suffix == ".pdf":
|
|
|
- text = extract_text_with_pages(file_path, max_pages=max_pages)
|
|
|
+ text = extract_text_with_pages(file_path, max_pages=max_pages, verbose=verbose)
|
|
|
elif suffix == ".docx":
|
|
|
text = extract_docx_text(file_path, max_pages=max_pages)
|
|
|
elif suffix == ".doc":
|
|
|
- print(" [DOC暂不支持,跳过]", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ print(" [DOC暂不支持,跳过]", end="", flush=True)
|
|
|
return False, ""
|
|
|
else:
|
|
|
return False, ""
|
|
|
@@ -285,7 +467,7 @@ def get_newest_file(files: List[Path]) -> Optional[Path]:
|
|
|
return max(files, key=lambda f: get_file_creation_time(f))
|
|
|
|
|
|
|
|
|
-def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, review_type: str) -> Tuple[bool, str, Optional[Path], List[Path], str]:
|
|
|
+def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, review_type: str, verbose: bool = True) -> Tuple[bool, str, Optional[Path], List[Path], str]:
|
|
|
"""
|
|
|
处理单个目录的文件筛选(指定阶段和评审类型)
|
|
|
|
|
|
@@ -294,6 +476,7 @@ def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, r
|
|
|
phase: 阶段(1或2)
|
|
|
max_pages: 检查的最大页数
|
|
|
review_type: 评审类型 ('expert' 或 'company')
|
|
|
+ verbose: 是否打印详细日志
|
|
|
|
|
|
Returns:
|
|
|
(是否成功, 状态信息, 选中的文件路径, 所有包含关键词的文件列表, 匹配到的关键词)
|
|
|
@@ -314,20 +497,24 @@ def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, r
|
|
|
matched_in_top5 = []
|
|
|
matched_keyword_top5 = ""
|
|
|
phase_str = f"【第{phase}阶段-{review_type}】"
|
|
|
- print(f"\n {phase_str} 目录: {dir_path.name} - 共{len(candidate_files)}个文件,检查前{max_pages}页,先检查Top5...")
|
|
|
+ if verbose:
|
|
|
+ print(f"\n {phase_str} 目录: {dir_path.name} - 共{len(candidate_files)}个文件,检查前{max_pages}页,先检查Top5...")
|
|
|
|
|
|
for file_path in top5_files:
|
|
|
size_mb = get_file_size(file_path) / (1024 * 1024)
|
|
|
- print(f" 检查Top5: {file_path.name[:30]}... (大小: {size_mb:.2f}MB)", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ print(f" 检查Top5: {file_path.name[:30]}... (大小: {size_mb:.2f}MB)", end="", flush=True)
|
|
|
|
|
|
- is_match, matched_kw = check_file_contains_keywords_with_pages(file_path, keywords, max_pages)
|
|
|
+ is_match, matched_kw = check_file_contains_keywords_with_pages(file_path, keywords, max_pages, verbose=verbose)
|
|
|
if is_match:
|
|
|
- print(f" -> ✓ 包含关键词[{matched_kw}]")
|
|
|
+ if verbose:
|
|
|
+ print(f" -> ✓ 包含关键词[{matched_kw}]")
|
|
|
matched_in_top5.append(file_path)
|
|
|
if not matched_keyword_top5:
|
|
|
matched_keyword_top5 = matched_kw
|
|
|
else:
|
|
|
- print(" -> ✗ 无关键词")
|
|
|
+ if verbose:
|
|
|
+ print(" -> ✗ 无关键词")
|
|
|
|
|
|
# Top5中找到匹配
|
|
|
if matched_in_top5:
|
|
|
@@ -336,7 +523,8 @@ def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, r
|
|
|
return True, f"Top5中找到{len(matched_in_top5)}个匹配,选择最新", selected, matched_in_top5, matched_keyword_top5
|
|
|
return True, "Top5中找到匹配文件", selected, matched_in_top5, matched_keyword_top5
|
|
|
|
|
|
- print(f" Top5未找到,扩展到其余{len(candidate_files) - len(top5_files)}个文件...")
|
|
|
+ if verbose:
|
|
|
+ print(f" Top5未找到,扩展到其余{len(candidate_files) - len(top5_files)}个文件...")
|
|
|
|
|
|
# 检查其余文件
|
|
|
other_files = [f for f in candidate_files if f not in top5_files]
|
|
|
@@ -345,16 +533,19 @@ def process_single_directory_phase(dir_path: Path, phase: int, max_pages: int, r
|
|
|
|
|
|
for file_path in other_files:
|
|
|
size_mb = get_file_size(file_path) / (1024 * 1024)
|
|
|
- print(f" 检查其他: {file_path.name[:30]}... (大小: {size_mb:.2f}MB)", end="", flush=True)
|
|
|
+ if verbose:
|
|
|
+ print(f" 检查其他: {file_path.name[:30]}... (大小: {size_mb:.2f}MB)", end="", flush=True)
|
|
|
|
|
|
- is_match, matched_kw = check_file_contains_keywords_with_pages(file_path, keywords, max_pages)
|
|
|
+ is_match, matched_kw = check_file_contains_keywords_with_pages(file_path, keywords, max_pages, verbose=verbose)
|
|
|
if is_match:
|
|
|
- print(f" -> ✓ 包含关键词[{matched_kw}]")
|
|
|
+ if verbose:
|
|
|
+ print(f" -> ✓ 包含关键词[{matched_kw}]")
|
|
|
matched_in_others.append(file_path)
|
|
|
if not matched_keyword_others:
|
|
|
matched_keyword_others = matched_kw
|
|
|
else:
|
|
|
- print(" -> ✗ 无关键词")
|
|
|
+ if verbose:
|
|
|
+ print(" -> ✗ 无关键词")
|
|
|
|
|
|
if matched_in_others:
|
|
|
selected = get_newest_file(matched_in_others)
|
|
|
@@ -374,14 +565,14 @@ def copy_file_to_output(file_path: Path, output_dir: Path, new_name: str) -> Pat
|
|
|
|
|
|
|
|
|
def process_review_type(dir_path: Path, phase: int, max_pages: int, review_type: str,
|
|
|
- output_dir: Path, dir_id: str, stats: dict, results: list) -> Tuple[bool, Optional[Path]]:
|
|
|
+ output_dir: Path, dir_id: str, verbose: bool = True) -> Tuple[bool, Optional[Path], Optional[Dict]]:
|
|
|
"""处理单个评审类型的筛选和输出
|
|
|
|
|
|
Returns:
|
|
|
- (是否成功, 选中的文件路径)
|
|
|
+ (是否成功, 选中的文件路径, 结果记录字典或None)
|
|
|
"""
|
|
|
success, message, selected_file, all_matched, matched_kw = process_single_directory_phase(
|
|
|
- dir_path, phase=phase, max_pages=max_pages, review_type=review_type
|
|
|
+ dir_path, phase=phase, max_pages=max_pages, review_type=review_type, verbose=verbose
|
|
|
)
|
|
|
|
|
|
if success and selected_file:
|
|
|
@@ -396,9 +587,10 @@ def process_review_type(dir_path: Path, phase: int, max_pages: int, review_type:
|
|
|
new_filename = f"{dir_id}_{selected_file.name}"
|
|
|
try:
|
|
|
dest_path = copy_file_to_output(selected_file, output_dir, new_filename)
|
|
|
- print(f" ✅ [{review_type}] 已输出: {new_filename}")
|
|
|
+ if verbose:
|
|
|
+ print(f" ✅ [{review_type}] 已输出: {new_filename}")
|
|
|
|
|
|
- results.append({
|
|
|
+ result_record = {
|
|
|
'目录ID': dir_id,
|
|
|
'评审类型': review_type,
|
|
|
'阶段': f'第{phase}阶段',
|
|
|
@@ -413,11 +605,12 @@ def process_review_type(dir_path: Path, phase: int, max_pages: int, review_type:
|
|
|
'原路径': str(selected_file),
|
|
|
'目标路径': str(dest_path),
|
|
|
'处理时间': datetime.now().isoformat()
|
|
|
- })
|
|
|
- return True, selected_file
|
|
|
+ }
|
|
|
+ return True, selected_file, result_record
|
|
|
except Exception as e:
|
|
|
- print(f" ❌ [{review_type}] 复制失败: {e}")
|
|
|
- results.append({
|
|
|
+ if verbose:
|
|
|
+ print(f" ❌ [{review_type}] 复制失败: {e}")
|
|
|
+ result_record = {
|
|
|
'目录ID': dir_id,
|
|
|
'评审类型': review_type,
|
|
|
'阶段': f'第{phase}阶段',
|
|
|
@@ -432,11 +625,12 @@ def process_review_type(dir_path: Path, phase: int, max_pages: int, review_type:
|
|
|
'原路径': str(selected_file),
|
|
|
'目标路径': '',
|
|
|
'处理时间': datetime.now().isoformat()
|
|
|
- })
|
|
|
- return False, None
|
|
|
+ }
|
|
|
+ return False, None, result_record
|
|
|
else:
|
|
|
- print(f" ❌ [{review_type}] {message}")
|
|
|
- return False, None
|
|
|
+ if verbose:
|
|
|
+ print(f" ❌ [{review_type}] {message}")
|
|
|
+ return False, None, None
|
|
|
|
|
|
|
|
|
def get_numeric_directories(base_dir: Path) -> List[Path]:
|
|
|
@@ -534,10 +728,175 @@ def resolve_config_path(path_value: str, script_dir: Path) -> Path:
|
|
|
return (script_dir / path).resolve()
|
|
|
|
|
|
|
|
|
+# ==================== 多进程工作函数 ====================
|
|
|
+
|
|
|
def process_directory_worker(args_tuple):
    """Screen one directory for both review types inside a worker process.

    Unpacks the job tuple, rebuilds the path strings as ``Path`` objects and
    calls ``process_review_type`` twice: first for the expert review, then
    for the company review. The result record of each call is kept when it
    is non-empty.

    Args:
        args_tuple: ``(dir_path_str, phase, max_pages,
            expert_output_dir_str, company_output_dir_str, verbose)`` --
            the directory to scan, the screening phase, the page limit
            forwarded to the screening call, the two output directories,
            and whether detailed logs are printed. Paths travel as plain
            strings.

    Returns:
        dict with the keys ``"dir_id"`` (name of the directory),
        ``"expert_success"`` and ``"company_success"`` (success flags
        returned by ``process_review_type``) and ``"results"`` (list of the
        non-empty result records, expert first; may be empty).
    """
    (source_dir, phase, max_pages,
     expert_target, company_target, verbose) = args_tuple

    dir_path = Path(source_dir)
    dir_id = dir_path.name

    # Fixed order: the expert review is always screened before the company one.
    targets = (
        ("expert", Path(expert_target)),
        ("company", Path(company_target)),
    )

    succeeded = {}
    records = []
    for review_type, output_dir in targets:
        # The selected file path (second element) is not needed here.
        ok, _selected, record = process_review_type(
            dir_path, phase=phase, max_pages=max_pages,
            review_type=review_type, output_dir=output_dir,
            dir_id=dir_id, verbose=verbose
        )
        succeeded[review_type] = ok
        if record:
            records.append(record)

    return {
        "dir_id": dir_id,
        "expert_success": succeeded["expert"],
        "company_success": succeeded["company"],
        "results": records,
    }
|
|
|
+
|
|
|
+
|
|
|
def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: int,
                           expert_output_dir: Path, company_output_dir: Path,
                           temp_dir: Path, numeric_dirs: List[Path],
                           stats: Dict, processed_dirs: Set,
                           phase1_no_match_expert: Set, phase1_no_match_company: Set,
                           verbose: bool = False, cache_every: int = 10) -> Tuple[Dict, Set, Set]:
    """Run one screening phase over many directories in parallel.

    Each directory is handed to ``process_directory_worker`` in a separate
    process. Results are consumed in completion order; for every finished
    directory the counters, the "no match" sets and ``processed_dirs`` are
    updated, a one-line progress display is refreshed, and the progress
    cache is written every ``cache_every`` directories and after the last.

    Args:
        dirs_to_process: Directories to screen in this call.
        phase: Screening phase (1 or 2). Only phase 1 adds directories to
            the "no match" sets; a success in any phase removes them.
        max_pages: Page limit forwarded to the worker.
        expert_output_dir: Output directory for expert reviews.
        company_output_dir: Output directory for company reviews.
        temp_dir: Directory holding the progress cache.
        numeric_dirs: All numeric directories; only their count is stored
            in the cache as ``total_directories``.
        stats: Statistics dict, updated in place.
        processed_dirs: Set of handled directory ids, updated in place.
        phase1_no_match_expert: Ids with no expert match, updated in place.
        phase1_no_match_company: Ids with no company match, updated in place.
        verbose: Whether workers print detailed logs.
        cache_every: Save the cache after this many finished directories
            (values below 1 are treated as 1).

    Returns:
        ``(stats, phase1_no_match_expert, phase1_no_match_company)`` -- the
        same objects that were passed in.

    Raises:
        Any exception raised by a worker is re-raised here after the jobs
        that have not started yet are cancelled.

    NOTE(review): the worker always screens both review types. If phase 2
    passes directories that missed only one type, the other type is screened
    and counted again -- confirm whether that is intended.
    """
    # Function-scope import so this block does not depend on a file-level change.
    from concurrent.futures import ProcessPoolExecutor, as_completed

    total = len(dirs_to_process)
    completed = 0
    # Guard the modulo below against cache_every <= 0.
    cache_every = max(1, cache_every)

    # A non-empty processed_dirs means the caller is resuming (or running a
    # later phase). The records written so far live in the cache file, so
    # carry them over; otherwise the first save below would overwrite them.
    # Assumes load_progress_cache returns a dict, as main() uses it -- confirm.
    all_results = []
    if processed_dirs:
        all_results = list(load_progress_cache(temp_dir).get("results", []))

    # Paths are sent as strings; the worker rebuilds Path objects.
    jobs = [
        (str(dir_path), phase, max_pages,
         str(expert_output_dir), str(company_output_dir), verbose)
        for dir_path in dirs_to_process
    ]

    print(f"\n 启动 {NUM_WORKERS} 个并发进程处理 {total} 个目录...")
    print(f" 工作模式: {'详细日志' if verbose else '静默模式(仅显示进度)'}\n")

    if jobs:
        # multiprocessing.Pool workers are daemonic and may not start child
        # processes, but the PDF text extraction relies on a child process for
        # its hard timeout. ProcessPoolExecutor workers are ordinary
        # (non-daemonic) processes, so the nested timeout process can start.
        with ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
            pending = [executor.submit(process_directory_worker, job) for job in jobs]
            try:
                # Completion order, like Pool.imap_unordered.
                for future in as_completed(pending):
                    result = future.result()
                    dir_id = result["dir_id"]

                    all_results.extend(result["results"])

                    if result["expert_success"]:
                        stats["expert_success_count"] = stats.get("expert_success_count", 0) + 1
                        # A later success clears an earlier "no match" mark.
                        phase1_no_match_expert.discard(dir_id)
                    elif phase == 1:
                        # Only the first phase records misses.
                        phase1_no_match_expert.add(dir_id)

                    if result["company_success"]:
                        stats["company_success_count"] = stats.get("company_success_count", 0) + 1
                        phase1_no_match_company.discard(dir_id)
                    elif phase == 1:
                        phase1_no_match_company.add(dir_id)

                    processed_dirs.add(dir_id)
                    completed += 1

                    progress = completed / total * 100
                    print(f"\r 进度: {completed}/{total} ({progress:.1f}%) | "
                          f"专家成功: {stats.get('expert_success_count', 0)} | "
                          f"公司成功: {stats.get('company_success_count', 0)} | "
                          f"当前: {dir_id}", end="", flush=True)

                    # Periodic checkpoint, plus one after the final directory.
                    if completed % cache_every == 0 or completed == total:
                        cache_data = {
                            "processed_dirs": list(processed_dirs),
                            "phase1_no_match_expert": list(phase1_no_match_expert),
                            "phase1_no_match_company": list(phase1_no_match_company),
                            "results": all_results,
                            "stats": stats,
                            "phase": phase,
                            "total_directories": len(numeric_dirs)
                        }
                        save_progress_cache(temp_dir, cache_data)
            except BaseException:
                # Abort promptly: drop jobs that have not started so the
                # executor does not work through the whole queue before the
                # error reaches the caller.
                for future in pending:
                    future.cancel()
                raise

    print(f"\n\n ✅ 阶段完成!处理 {completed} 个目录")
    print(f" 专家评审成功: {stats.get('expert_success_count', 0)} 个")
    print(f" 公司评审成功: {stats.get('company_success_count', 0)} 个")

    return stats, phase1_no_match_expert, phase1_no_match_company
|
|
|
+
|
|
|
+
|
|
|
def main():
|
|
|
"""主函数"""
|
|
|
print("=" * 70)
|
|
|
- print("评审意见PDF筛选脚本 - 二级筛选+断点续传版")
|
|
|
+ print("评审意见PDF筛选脚本 - 多进程并发版+测试模式")
|
|
|
print("=" * 70)
|
|
|
|
|
|
# 按文件首部配置组装路径(不再按项目根目录拼接)
|
|
|
@@ -552,6 +911,8 @@ def main():
|
|
|
print(f" 专家评审输出目录: {expert_output_dir}")
|
|
|
print(f" 公司评审输出目录: {company_output_dir}")
|
|
|
print(f" 缓存目录: {temp_dir}")
|
|
|
+ print(f" 并发进程数: {NUM_WORKERS}")
|
|
|
+ print(f" 测试模式: {'是(抽取5个目录)' if TEST_MODE else '否'}")
|
|
|
print(f" 专家评审关键词: {KEYWORDS['expert']}")
|
|
|
print(f" 公司评审关键词: {KEYWORDS['company']}")
|
|
|
print(f" 一级筛选: 前{PHASE_1_PAGES}页")
|
|
|
@@ -575,23 +936,50 @@ def main():
|
|
|
sys.exit(1)
|
|
|
|
|
|
print(f" 找到 {len(numeric_dirs)} 个数字编号子目录")
|
|
|
-
|
|
|
- # 加载缓存(断点续传)
|
|
|
- print(f"\n【步骤 2/6】加载进度缓存...")
|
|
|
- cache = load_progress_cache(temp_dir)
|
|
|
+
|
|
|
+ # ==================== 测试模式:随机抽取 ====================
|
|
|
+ if TEST_MODE:
|
|
|
+ print(f"\n【测试模式】随机抽取 {TEST_SAMPLE_SIZE} 个目录进行测试...")
|
|
|
+ if len(numeric_dirs) <= TEST_SAMPLE_SIZE:
|
|
|
+ test_dirs = numeric_dirs
|
|
|
+ print(f" 目录总数不足 {TEST_SAMPLE_SIZE},测试全部 {len(numeric_dirs)} 个目录")
|
|
|
+ else:
|
|
|
+ # 使用固定随机种子,确保可复现
|
|
|
+ random.seed(42)
|
|
|
+ test_dirs = random.sample(numeric_dirs, TEST_SAMPLE_SIZE)
|
|
|
+ test_dirs.sort(key=lambda d: int(d.name)) # 按数字排序,方便查看
|
|
|
+
|
|
|
+ numeric_dirs = test_dirs
|
|
|
+ print(f" 测试目录: {[d.name for d in test_dirs]}")
|
|
|
+ # 测试模式不加载缓存,不进入二级筛选
|
|
|
+ cache = {
|
|
|
+ "processed_dirs": [],
|
|
|
+ "phase1_no_match_expert": [],
|
|
|
+ "phase1_no_match_company": [],
|
|
|
+ "results": [],
|
|
|
+ "stats": {},
|
|
|
+ "phase": 1,
|
|
|
+ "last_update": None
|
|
|
+ }
|
|
|
+ else:
|
|
|
+ # 加载缓存(断点续传)
|
|
|
+ print(f"\n【步骤 2/6】加载进度缓存...")
|
|
|
+ cache = load_progress_cache(temp_dir)
|
|
|
+
|
|
|
processed_dirs = set(cache.get("processed_dirs", []))
|
|
|
phase1_no_match_expert = set(cache.get("phase1_no_match_expert", []))
|
|
|
phase1_no_match_company = set(cache.get("phase1_no_match_company", []))
|
|
|
current_phase = cache.get("phase", 1)
|
|
|
+ all_results = cache.get("results", [])
|
|
|
|
|
|
- if processed_dirs:
|
|
|
+ if processed_dirs and not TEST_MODE:
|
|
|
print(f" 发现缓存:")
|
|
|
print(f" - 已处理: {len(processed_dirs)} 个目录")
|
|
|
print(f" - 专家评审一级未找到: {len(phase1_no_match_expert)} 个目录")
|
|
|
print(f" - 公司评审一级未找到: {len(phase1_no_match_company)} 个目录")
|
|
|
print(f" - 当前阶段: 第{current_phase}阶段")
|
|
|
else:
|
|
|
- print(f" 无缓存,将从头开始处理")
|
|
|
+ print(f" {'无缓存(测试模式),将从头开始处理' if TEST_MODE else '无缓存,将从头开始处理'}")
|
|
|
|
|
|
# ==================== 一级筛选 ====================
|
|
|
if current_phase == 1:
|
|
|
@@ -610,8 +998,6 @@ def main():
|
|
|
print(f" 公司评审输出到: {company_output_dir}")
|
|
|
print()
|
|
|
|
|
|
- results = cache.get("results", [])
|
|
|
-
|
|
|
default_stats = {
|
|
|
"total_directories": len(numeric_dirs),
|
|
|
"expert_success_count": 0,
|
|
|
@@ -630,55 +1016,23 @@ def main():
|
|
|
if key not in stats:
|
|
|
stats[key] = value
|
|
|
|
|
|
- expert_success_count = 0
|
|
|
- company_success_count = 0
|
|
|
- total_to_process = len(dirs_to_process)
|
|
|
-
|
|
|
- for idx, dir_path in enumerate(dirs_to_process):
|
|
|
- dir_id = dir_path.name
|
|
|
- overall_idx = len(processed_dirs) + idx + 1
|
|
|
-
|
|
|
- print(f"\n[{overall_idx}/{len(numeric_dirs)}] 当前目录: {dir_id}")
|
|
|
- print_progress_bar(idx + 1, total_to_process)
|
|
|
-
|
|
|
- # 处理专家评审
|
|
|
- expert_success, expert_file = process_review_type(
|
|
|
- dir_path, phase=1, max_pages=PHASE_1_PAGES,
|
|
|
- review_type="expert", output_dir=expert_output_dir,
|
|
|
- dir_id=dir_id, stats=stats, results=results
|
|
|
- )
|
|
|
- if expert_success:
|
|
|
- stats["expert_success_count"] += 1
|
|
|
- expert_success_count += 1
|
|
|
- else:
|
|
|
- phase1_no_match_expert.add(dir_id)
|
|
|
-
|
|
|
- # 处理公司评审
|
|
|
- company_success, company_file = process_review_type(
|
|
|
- dir_path, phase=1, max_pages=PHASE_1_PAGES,
|
|
|
- review_type="company", output_dir=company_output_dir,
|
|
|
- dir_id=dir_id, stats=stats, results=results
|
|
|
- )
|
|
|
- if company_success:
|
|
|
- stats["company_success_count"] += 1
|
|
|
- company_success_count += 1
|
|
|
- else:
|
|
|
- phase1_no_match_company.add(dir_id)
|
|
|
-
|
|
|
- processed_dirs.add(dir_id)
|
|
|
-
|
|
|
- if (idx + 1) % 10 == 0 or idx == len(dirs_to_process) - 1:
|
|
|
- cache_data = {
|
|
|
- "processed_dirs": list(processed_dirs),
|
|
|
- "phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
- "phase1_no_match_company": list(phase1_no_match_company),
|
|
|
- "results": results,
|
|
|
- "stats": stats,
|
|
|
- "phase": 1,
|
|
|
- "total_directories": len(numeric_dirs)
|
|
|
- }
|
|
|
- save_progress_cache(temp_dir, cache_data)
|
|
|
- print(f"\n 💾 进度已缓存 (已处理 {len(processed_dirs)}/{len(numeric_dirs)} 个目录)")
|
|
|
+ # 并发处理一级筛选
|
|
|
+ # 在测试模式下使用详细日志(verbose=True),正式运行使用静默模式(verbose=False)
|
|
|
+ verbose_mode = TEST_MODE # 测试模式打印详细日志,正式模式静默
|
|
|
+ stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
|
+ dirs_to_process, phase=1, max_pages=PHASE_1_PAGES,
|
|
|
+ expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
|
+ temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
|
+ stats=stats, processed_dirs=processed_dirs,
|
|
|
+ phase1_no_match_expert=phase1_no_match_expert,
|
|
|
+ phase1_no_match_company=phase1_no_match_company,
|
|
|
+ verbose=verbose_mode,
|
|
|
+ cache_every=10
|
|
|
+ )
|
|
|
+
|
|
|
+ # 更新结果列表
|
|
|
+ cache = load_progress_cache(temp_dir)
|
|
|
+ all_results = cache.get("results", [])
|
|
|
|
|
|
print(f"\n\n【一级筛选完成】")
|
|
|
print(f" 专家评审成功: {stats['expert_success_count']} 个")
|
|
|
@@ -686,6 +1040,23 @@ def main():
|
|
|
print(f" 专家评审未找到: {len(phase1_no_match_expert)} 个")
|
|
|
print(f" 公司评审未找到: {len(phase1_no_match_company)} 个")
|
|
|
|
|
|
+ # 测试模式下直接退出,不进行二级筛选和保存
|
|
|
+ if TEST_MODE:
|
|
|
+ print(f"\n{'='*70}")
|
|
|
+ print("【测试模式完成】")
|
|
|
+ print(f" 共测试 {len(dirs_to_process)} 个目录")
|
|
|
+ print(f" 专家评审成功: {stats['expert_success_count']} 个")
|
|
|
+ print(f" 公司评审成功: {stats['company_success_count']} 个")
|
|
|
+ print(f" 测试结果已保存到缓存,可查看输出目录确认文件")
|
|
|
+ print("="*70)
|
|
|
+
|
|
|
+ # 测试模式也保存最终结果
|
|
|
+ _save_final_results(
|
|
|
+ temp_dir, expert_output_dir, company_output_dir,
|
|
|
+ numeric_dirs, all_results, stats, processed_dirs
|
|
|
+ )
|
|
|
+ return
|
|
|
+
|
|
|
# 询问是否进行二级筛选
|
|
|
total_no_match = len(phase1_no_match_expert.union(phase1_no_match_company))
|
|
|
print(f"\n{'='*70}")
|
|
|
@@ -720,7 +1091,7 @@ def main():
|
|
|
"processed_dirs": list(processed_dirs),
|
|
|
"phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
"phase1_no_match_company": list(phase1_no_match_company),
|
|
|
- "results": results,
|
|
|
+ "results": all_results,
|
|
|
"stats": stats,
|
|
|
"phase": 2,
|
|
|
"total_directories": len(numeric_dirs)
|
|
|
@@ -735,7 +1106,7 @@ def main():
|
|
|
|
|
|
# 重新加载以获取最新状态
|
|
|
cache = load_progress_cache(temp_dir)
|
|
|
- results = cache.get("results", [])
|
|
|
+ all_results = cache.get("results", [])
|
|
|
stats = cache.get("stats", {})
|
|
|
phase1_no_match_expert = set(cache.get("phase1_no_match_expert", []))
|
|
|
phase1_no_match_company = set(cache.get("phase1_no_match_company", []))
|
|
|
@@ -743,7 +1114,9 @@ def main():
|
|
|
# 获取需要二级筛选的目录(专家评审或公司评审任一未找到)
|
|
|
phase2_dirs_expert = [d for d in numeric_dirs if d.name in phase1_no_match_expert]
|
|
|
phase2_dirs_company = [d for d in numeric_dirs if d.name in phase1_no_match_company]
|
|
|
- all_phase2_dirs = set(phase2_dirs_expert + phase2_dirs_company)
|
|
|
+ all_phase2_dirs = list(set(phase2_dirs_expert + phase2_dirs_company))
|
|
|
+ # 按数字排序
|
|
|
+ all_phase2_dirs.sort(key=lambda d: int(d.name))
|
|
|
|
|
|
if not all_phase2_dirs:
|
|
|
print(f"\n 没有需要二级筛选的目录")
|
|
|
@@ -751,84 +1124,49 @@ def main():
|
|
|
print(f"\n【步骤 4/6】二级筛选处理...")
|
|
|
print(f" 专家评审需二级筛选: {len(phase2_dirs_expert)} 个目录")
|
|
|
print(f" 公司评审需二级筛选: {len(phase2_dirs_company)} 个目录")
|
|
|
-
|
|
|
- expert_phase2_success = 0
|
|
|
- company_phase2_success = 0
|
|
|
-
|
|
|
- # 处理专家评审二级筛选
|
|
|
- if phase2_dirs_expert:
|
|
|
- print(f"\n --- 专家评审二级筛选 ---")
|
|
|
- for idx, dir_path in enumerate(phase2_dirs_expert):
|
|
|
- dir_id = dir_path.name
|
|
|
- print(f"\n[{idx+1}/{len(phase2_dirs_expert)}] 专家评审二级筛选: {dir_id}")
|
|
|
-
|
|
|
- success, selected_file = process_review_type(
|
|
|
- dir_path, phase=2, max_pages=PHASE_2_PAGES,
|
|
|
- review_type="expert", output_dir=expert_output_dir,
|
|
|
- dir_id=dir_id, stats=stats, results=results
|
|
|
- )
|
|
|
- if success:
|
|
|
- stats["expert_phase2_success_count"] = stats.get("expert_phase2_success_count", 0) + 1
|
|
|
- expert_phase2_success += 1
|
|
|
- phase1_no_match_expert.discard(dir_id)
|
|
|
-
|
|
|
- if (idx + 1) % 10 == 0 or idx == len(phase2_dirs_expert) - 1:
|
|
|
- cache_data = {
|
|
|
- "processed_dirs": list(processed_dirs),
|
|
|
- "phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
- "phase1_no_match_company": list(phase1_no_match_company),
|
|
|
- "results": results,
|
|
|
- "stats": stats,
|
|
|
- "phase": 2,
|
|
|
- "total_directories": len(numeric_dirs)
|
|
|
- }
|
|
|
- save_progress_cache(temp_dir, cache_data)
|
|
|
-
|
|
|
- # 处理公司评审二级筛选
|
|
|
- if phase2_dirs_company:
|
|
|
- print(f"\n --- 公司评审二级筛选 ---")
|
|
|
- for idx, dir_path in enumerate(phase2_dirs_company):
|
|
|
- dir_id = dir_path.name
|
|
|
- print(f"\n[{idx+1}/{len(phase2_dirs_company)}] 公司评审二级筛选: {dir_id}")
|
|
|
-
|
|
|
- success, selected_file = process_review_type(
|
|
|
- dir_path, phase=2, max_pages=PHASE_2_PAGES,
|
|
|
- review_type="company", output_dir=company_output_dir,
|
|
|
- dir_id=dir_id, stats=stats, results=results
|
|
|
- )
|
|
|
- if success:
|
|
|
- stats["company_phase2_success_count"] = stats.get("company_phase2_success_count", 0) + 1
|
|
|
- company_phase2_success += 1
|
|
|
- phase1_no_match_company.discard(dir_id)
|
|
|
-
|
|
|
- if (idx + 1) % 10 == 0 or idx == len(phase2_dirs_company) - 1:
|
|
|
- cache_data = {
|
|
|
- "processed_dirs": list(processed_dirs),
|
|
|
- "phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
- "phase1_no_match_company": list(phase1_no_match_company),
|
|
|
- "results": results,
|
|
|
- "stats": stats,
|
|
|
- "phase": 2,
|
|
|
- "total_directories": len(numeric_dirs)
|
|
|
- }
|
|
|
- save_progress_cache(temp_dir, cache_data)
|
|
|
+ print(f" 总计需二级筛选: {len(all_phase2_dirs)} 个目录")
|
|
|
+
|
|
|
+ # 二级筛选也使用并发处理
|
|
|
+ # 注意:二级筛选时,之前已成功的目录不需要再处理
|
|
|
+ # 但由于 process_directory_worker 会同时处理两种类型,
|
|
|
+ # 已成功的类型会再次被处理(但结果相同,不会重复复制因为文件名相同会覆盖)
|
|
|
+ # 为了效率,我们只处理有未匹配的目录
|
|
|
+
|
|
|
+ stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
|
+ all_phase2_dirs, phase=2, max_pages=PHASE_2_PAGES,
|
|
|
+ expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
|
+ temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
|
+ stats=stats, processed_dirs=processed_dirs,
|
|
|
+ phase1_no_match_expert=phase1_no_match_expert,
|
|
|
+ phase1_no_match_company=phase1_no_match_company,
|
|
|
+ verbose=False, # 二级筛选使用静默模式
|
|
|
+ cache_every=10
|
|
|
+ )
|
|
|
+
|
|
|
+ # 更新结果
|
|
|
+ cache = load_progress_cache(temp_dir)
|
|
|
+ all_results = cache.get("results", [])
|
|
|
|
|
|
print(f"\n\n【二级筛选完成】")
|
|
|
- print(f" 专家评审二级筛选成功: {expert_phase2_success} 个")
|
|
|
- print(f" 公司评审二级筛选成功: {company_phase2_success} 个")
|
|
|
+ print(f" 专家评审二级筛选成功: {stats.get('expert_phase2_success_count', 0)} 个")
|
|
|
+ print(f" 公司评审二级筛选成功: {stats.get('company_phase2_success_count', 0)} 个")
|
|
|
|
|
|
current_phase = 3
|
|
|
|
|
|
# ==================== 保存最终结果 ====================
|
|
|
+ _save_final_results(
|
|
|
+ temp_dir, expert_output_dir, company_output_dir,
|
|
|
+ numeric_dirs, all_results, stats, processed_dirs
|
|
|
+ )
|
|
|
+
|
|
|
+
|
|
|
+def _save_final_results(temp_dir: Path, expert_output_dir: Path, company_output_dir: Path,
|
|
|
+ numeric_dirs: List[Path], results: List[Dict], stats: Dict, processed_dirs: Set):
|
|
|
+ """保存最终结果(Excel、JSON统计、缺失目录ID等)"""
|
|
|
print(f"\n\n【步骤 5/6】保存最终结果...")
|
|
|
expert_output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
company_output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
- # 重新加载最新结果
|
|
|
- cache = load_progress_cache(temp_dir)
|
|
|
- results = cache.get("results", [])
|
|
|
- stats = cache.get("stats", {})
|
|
|
-
|
|
|
# 确保所有键存在
|
|
|
default_keys = [
|
|
|
"total_directories", "expert_success_count", "company_success_count",
|
|
|
@@ -854,7 +1192,7 @@ def main():
|
|
|
company_phase1 = stats.get("company_success_count", 0)
|
|
|
company_phase2 = stats.get("company_phase2_success_count", 0)
|
|
|
|
|
|
- # 基于最终结果反推“未采集到”的目录ID,避免阶段缓存集合被覆盖导致不准确
|
|
|
+ # 基于最终结果反推"未采集到"的目录ID,避免阶段缓存集合被覆盖导致不准确
|
|
|
all_dir_ids = {d.name for d in numeric_dirs}
|
|
|
expert_success_ids = {
|
|
|
str(r.get("目录ID"))
|
|
|
@@ -877,7 +1215,7 @@ def main():
|
|
|
f.write("\n".join(company_missing_ids))
|
|
|
|
|
|
final_stats = {
|
|
|
- "start_time": cache.get("start_time", datetime.now().isoformat()),
|
|
|
+ "start_time": stats.get("start_time", datetime.now().isoformat()),
|
|
|
"end_time": datetime.now().isoformat(),
|
|
|
"total_directories": len(numeric_dirs),
|
|
|
"processed_count": len(processed_dirs),
|
|
|
@@ -944,6 +1282,8 @@ def main():
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
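+ # freeze_support() is only required when the script is frozen into a Windows executable (e.g. PyInstaller); it has no effect when run by the normal interpreter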
|
|
|
+ mp.freeze_support()
|
|
|
try:
|
|
|
main()
|
|
|
except KeyboardInterrupt:
|