|
@@ -1,10 +1,10 @@
|
|
|
#!/usr/bin/env python3
|
|
#!/usr/bin/env python3
|
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
|
"""
|
|
"""
|
|
|
-评审意见PDF文件筛选脚本 - 多进程并发版+测试模式
|
|
|
|
|
|
|
+评审意见PDF文件筛选脚本 - 多进程并发版+测试模式+命令行参数
|
|
|
|
|
|
|
|
功能说明:
|
|
功能说明:
|
|
|
- 从raw/670目录下的数字编号子目录中筛选评审意见PDF文件。
|
|
|
|
|
|
|
+ 从源目录下的数字编号子目录中筛选评审意见PDF文件。
|
|
|
|
|
|
|
|
一级筛选(默认):
|
|
一级筛选(默认):
|
|
|
- 检查前15页是否包含"专家评审"或"公司评审"或"集团评审"
|
|
- 检查前15页是否包含"专家评审"或"公司评审"或"集团评审"
|
|
@@ -18,28 +18,32 @@
|
|
|
3. 如果top5中没找到,则将范围扩大到其余文件
|
|
3. 如果top5中没找到,则将范围扩大到其余文件
|
|
|
4. 如果都没找到,记录为"无评审意见"
|
|
4. 如果都没找到,记录为"无评审意见"
|
|
|
5. 如果多份文件都找到关键词,以创建时间最新的为准
|
|
5. 如果多份文件都找到关键词,以创建时间最新的为准
|
|
|
- 6. 专家评审输出到output/expert_review目录
|
|
|
|
|
- 7. 公司/集团评审输出到output/company_review目录
|
|
|
|
|
|
|
+ 6. 专家评审输出到指定目录
|
|
|
|
|
+ 7. 公司/集团评审输出到指定目录
|
|
|
8. 支持断点续传,在temp目录缓存处理进度
|
|
8. 支持断点续传,在temp目录缓存处理进度
|
|
|
- 9. 【新增】支持多进程并发筛选,提高效率
|
|
|
|
|
- 10.【新增】支持随机抽取测试模式,快速验证
|
|
|
|
|
|
|
+ 9. 支持多进程并发筛选,提高效率
|
|
|
|
|
+ 10. 支持随机抽取测试模式,快速验证
|
|
|
|
|
+ 11. 支持命令行参数配置路径和参数
|
|
|
|
|
|
|
|
-输入:
|
|
|
|
|
- - 源目录: raw/670/ (包含数字编号子目录,如1567、1569等)
|
|
|
|
|
- - 子目录中的PDF文件名是UUID格式
|
|
|
|
|
|
|
+使用方式:
|
|
|
|
|
+ # 方式1:直接运行(使用文件顶部的默认路径)
|
|
|
|
|
+ python 03-施工方案筛选.py
|
|
|
|
|
|
|
|
-输出:
|
|
|
|
|
- - 专家评审目录: output/expert_review/ (专家评审PDF)
|
|
|
|
|
- - 公司评审目录: output/company_review/ (公司/集团评审PDF)
|
|
|
|
|
- - 结果记录: output/评审筛选结果记录.xlsx
|
|
|
|
|
- - 统计JSON: output/评审筛选统计.json
|
|
|
|
|
- - 缓存文件: data_pipline/script/temp/评审筛选进度缓存.json
|
|
|
|
|
|
|
+ # 方式2:通过命令行参数指定路径
|
|
|
|
|
+ python 03-施工方案筛选.py --source-dir "E:/data/raw" --expert-output-dir "E:/output/expert" --company-output-dir "E:/output/company"
|
|
|
|
|
+
|
|
|
|
|
+ # 方式3:测试模式(随机抽取10个目录)
|
|
|
|
|
+ python 03-施工方案筛选.py --test-mode --test-sample-size 10
|
|
|
|
|
+
|
|
|
|
|
+ # 方式4:指定并发数和筛选页数
|
|
|
|
|
+ python 03-施工方案筛选.py --workers 8 --phase1-pages 20 --phase2-pages 50
|
|
|
|
|
|
|
|
作者: Claude
|
|
作者: Claude
|
|
|
日期: 2026-04-21
|
|
日期: 2026-04-21
|
|
|
-更新: 2026-05-08 - 增加多进程并发和测试模式
|
|
|
|
|
|
|
+更新: 2026-05-11 - 增加命令行参数支持
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
|
|
+import argparse
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
import json
|
|
import json
|
|
|
import sys
|
|
import sys
|
|
@@ -59,30 +63,16 @@ from docx import Document
|
|
|
warnings.filterwarnings('ignore', category=UserWarning, module='PyPDF2')
|
|
warnings.filterwarnings('ignore', category=UserWarning, module='PyPDF2')
|
|
|
warnings.filterwarnings('ignore', category=Warning)
|
|
warnings.filterwarnings('ignore', category=Warning)
|
|
|
|
|
|
|
|
-# ==================== 路径配置(可在文件首部直接修改)====================
|
|
|
|
|
-# 规则:
|
|
|
|
|
-# 1) 填绝对路径(如 E:/data/raw/670)则直接使用(Windows 建议用 / 或 \\)
|
|
|
|
|
-# 2) 填相对路径(如 ../../raw/670)则相对当前脚本目录解析
|
|
|
|
|
-SOURCE_DIR = r"E:\提供的原始文件\原始文件\全部的原始文档\未提取"
|
|
|
|
|
-EXPERT_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
|
|
|
|
|
-COMPANY_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
|
|
|
|
|
-TEMP_DIR = "temp"
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-# ==================== 并发与测试配置 ====================
|
|
|
|
|
-# 多进程并发配置
|
|
|
|
|
-NUM_WORKERS = 4 # 并发进程数,建议设为CPU核心数(如CPU有8核则设为6-8)
|
|
|
|
|
- # 注意:每个工作进程内部还会为单个PDF创建子进程(超时控制)
|
|
|
|
|
- # 因此 NUM_WORKERS 不宜过大,避免进程过多导致系统资源耗尽
|
|
|
|
|
-
|
|
|
|
|
-# 测试模式配置
|
|
|
|
|
-TEST_MODE = False # 是否启用测试模式:随机抽取少量目录快速测试
|
|
|
|
|
-TEST_SAMPLE_SIZE = 5 # 测试模式下随机抽取的目录数量
|
|
|
|
|
- # 测试完成后会输出结果并自动退出,不会进入二级筛选
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-# 分批配置(仅用于统计显示,不创建子目录)
|
|
|
|
|
-BATCH_SIZE = 50 # 每批处理的目录数量(仅用于进度显示)
|
|
|
|
|
|
|
+# ==================== 默认路径配置(命令行未指定时使用)====================
|
|
|
|
|
+DEFAULT_SOURCE_DIR = r"E:\提供的原始文件\原始文件\全部的原始文档\未提取"
|
|
|
|
|
+DEFAULT_EXPERT_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\专家评审意见_记录"
|
|
|
|
|
+DEFAULT_COMPANY_OUTPUT_DIR = r"E:\提供的原始文件\原始文件\PDF分类结果_服务器MinerU版\公司集团评审意见说明"
|
|
|
|
|
+DEFAULT_TEMP_DIR = "temp"
|
|
|
|
|
+DEFAULT_NUM_WORKERS = 4
|
|
|
|
|
+DEFAULT_TEST_MODE = False
|
|
|
|
|
+DEFAULT_TEST_SAMPLE_SIZE = 5
|
|
|
|
|
+DEFAULT_PHASE1_PAGES = 15
|
|
|
|
|
+DEFAULT_PHASE2_PAGES = 30
|
|
|
|
|
|
|
|
# 关键词配置
|
|
# 关键词配置
|
|
|
KEYWORDS = {
|
|
KEYWORDS = {
|
|
@@ -272,9 +262,125 @@ KEYWORD_PATTERNS = {
|
|
|
],
|
|
],
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-# 筛选阶段配置
|
|
|
|
|
-PHASE_1_PAGES = 15 # 一级筛选:前15页
|
|
|
|
|
-PHASE_2_PAGES = 30 # 二级筛选:前30页
|
|
|
|
|
|
|
+# 默认筛选阶段配置
|
|
|
|
|
+DEFAULT_PHASE1_PAGES = 15 # 一级筛选:前15页
|
|
|
|
|
+DEFAULT_PHASE2_PAGES = 30 # 二级筛选:前30页
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the review-PDF screening script.

    Every option falls back to the module-level ``DEFAULT_*`` constant (or the
    literal default shown below) when it is not given on the command line.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what a plain
            ``parse_args()`` call did before this parameter existed.

    Returns:
        argparse.Namespace with the attributes ``source_dir``,
        ``expert_output_dir``, ``company_output_dir``, ``temp_dir``,
        ``phase1_pages``, ``phase2_pages``, ``disable_phase2``, ``workers``,
        ``test_mode``, ``test_sample_size``, ``pdf_timeout``, ``cache_every``,
        ``verbose`` and ``retry_failed``.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or for an invalid
            option, including a non-positive value for any integer option.
    """

    def positive_int(text: str) -> int:
        """argparse ``type`` callback that only accepts integers >= 1."""
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"不是有效的整数: {text!r}") from None
        if value < 1:
            raise argparse.ArgumentTypeError(f"必须是正整数: {text!r}")
        return value

    parser = argparse.ArgumentParser(
        description="评审意见PDF文件筛选脚本 - 多进程并发版",
        # Raw formatter keeps the hand-written line breaks of the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 使用默认路径运行
  python 03-施工方案筛选.py

  # 指定输入输出路径
  python 03-施工方案筛选.py --source-dir "E:/data/raw" --expert-output-dir "E:/output/expert" --company-output-dir "E:/output/company"

  # 测试模式(随机抽取10个目录)
  python 03-施工方案筛选.py --test-mode --test-sample-size 10

  # 指定并发数和筛选页数
  python 03-施工方案筛选.py --workers 8 --phase1-pages 20 --phase2-pages 50

  # 禁用二级筛选
  python 03-施工方案筛选.py --disable-phase2

  # 重试历史处理失败的目录
  python 03-施工方案筛选.py --retry-failed
        """,
    )

    # Path options. "%(default)s" lets argparse substitute the default itself,
    # so a "%" inside a configured path cannot break --help formatting.
    parser.add_argument(
        "--source-dir",
        default=DEFAULT_SOURCE_DIR,
        help="源目录路径(包含数字编号子目录),默认: %(default)s",
    )
    parser.add_argument(
        "--expert-output-dir",
        default=DEFAULT_EXPERT_OUTPUT_DIR,
        help="专家评审输出目录,默认: %(default)s",
    )
    parser.add_argument(
        "--company-output-dir",
        default=DEFAULT_COMPANY_OUTPUT_DIR,
        help="公司/集团评审输出目录,默认: %(default)s",
    )
    parser.add_argument(
        "--temp-dir",
        default=DEFAULT_TEMP_DIR,
        help="缓存目录,默认: %(default)s",
    )

    # Screening options.
    parser.add_argument(
        "--phase1-pages",
        type=positive_int,
        default=DEFAULT_PHASE1_PAGES,
        help="一级筛选检查的页数,默认: %(default)s",
    )
    parser.add_argument(
        "--phase2-pages",
        type=positive_int,
        default=DEFAULT_PHASE2_PAGES,
        help="二级筛选检查的页数,默认: %(default)s",
    )
    parser.add_argument(
        "--disable-phase2",
        action="store_true",
        help="禁用二级筛选(仅执行一级筛选)",
    )

    # Concurrency option; the value is handed to the process pool, which
    # needs at least one worker.
    parser.add_argument(
        "--workers",
        type=positive_int,
        default=DEFAULT_NUM_WORKERS,
        help="并发进程数,默认: %(default)s",
    )

    # Test-mode options.
    parser.add_argument(
        "--test-mode",
        action="store_true",
        help="启用测试模式(随机抽取少量目录快速验证)",
    )
    parser.add_argument(
        "--test-sample-size",
        type=positive_int,
        default=DEFAULT_TEST_SAMPLE_SIZE,
        help="测试模式下随机抽取的目录数量,默认: %(default)s",
    )

    # Miscellaneous options.
    parser.add_argument(
        "--pdf-timeout",
        type=positive_int,
        default=30,
        help="单个PDF解析超时秒数,默认: %(default)s",
    )
    parser.add_argument(
        # Used as a modulus when deciding when to save the cache, so zero
        # must be rejected here.
        "--cache-every",
        type=positive_int,
        default=10,
        help="每处理多少个目录保存一次缓存,默认: %(default)s",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="显示详细处理日志(默认仅显示进度)",
    )
    parser.add_argument(
        "--retry-failed",
        action="store_true",
        help="重试历史处理失败的目录(状态为'处理异常'、'复制失败'、'读取失败'或'解析失败'的目录)",
    )

    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_file_size(file_path: Path) -> int:
|
|
def get_file_size(file_path: Path) -> int:
|
|
@@ -733,10 +839,10 @@ def resolve_config_path(path_value: str, script_dir: Path) -> Path:
|
|
|
def process_directory_worker(args_tuple):
|
|
def process_directory_worker(args_tuple):
|
|
|
"""
|
|
"""
|
|
|
工作进程函数:处理单个目录的两种评审类型
|
|
工作进程函数:处理单个目录的两种评审类型
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
此函数在独立的工作进程中运行,同时处理专家评审和公司评审。
|
|
此函数在独立的工作进程中运行,同时处理专家评审和公司评审。
|
|
|
工作进程之间互不影响,各自独立复制文件到输出目录。
|
|
工作进程之间互不影响,各自独立复制文件到输出目录。
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
Args:
|
|
Args:
|
|
|
args_tuple: (
|
|
args_tuple: (
|
|
|
dir_path_str, # 目录路径字符串
|
|
dir_path_str, # 目录路径字符串
|
|
@@ -746,7 +852,7 @@ def process_directory_worker(args_tuple):
|
|
|
company_output_dir_str, # 公司评审输出目录
|
|
company_output_dir_str, # 公司评审输出目录
|
|
|
verbose # 是否打印详细日志
|
|
verbose # 是否打印详细日志
|
|
|
)
|
|
)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
Returns:
|
|
Returns:
|
|
|
dict: {
|
|
dict: {
|
|
|
"dir_id": str, # 目录ID
|
|
"dir_id": str, # 目录ID
|
|
@@ -756,32 +862,55 @@ def process_directory_worker(args_tuple):
|
|
|
}
|
|
}
|
|
|
"""
|
|
"""
|
|
|
dir_path_str, phase, max_pages, expert_output_dir_str, company_output_dir_str, verbose = args_tuple
|
|
dir_path_str, phase, max_pages, expert_output_dir_str, company_output_dir_str, verbose = args_tuple
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
dir_path = Path(dir_path_str)
|
|
dir_path = Path(dir_path_str)
|
|
|
expert_output_dir = Path(expert_output_dir_str)
|
|
expert_output_dir = Path(expert_output_dir_str)
|
|
|
company_output_dir = Path(company_output_dir_str)
|
|
company_output_dir = Path(company_output_dir_str)
|
|
|
dir_id = dir_path.name
|
|
dir_id = dir_path.name
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
results = []
|
|
results = []
|
|
|
-
|
|
|
|
|
- # 处理专家评审
|
|
|
|
|
- expert_success, expert_file, expert_result = process_review_type(
|
|
|
|
|
- dir_path, phase=phase, max_pages=max_pages,
|
|
|
|
|
- review_type="expert", output_dir=expert_output_dir,
|
|
|
|
|
- dir_id=dir_id, verbose=verbose
|
|
|
|
|
- )
|
|
|
|
|
- if expert_result:
|
|
|
|
|
- results.append(expert_result)
|
|
|
|
|
-
|
|
|
|
|
- # 处理公司评审
|
|
|
|
|
- company_success, company_file, company_result = process_review_type(
|
|
|
|
|
- dir_path, phase=phase, max_pages=max_pages,
|
|
|
|
|
- review_type="company", output_dir=company_output_dir,
|
|
|
|
|
- dir_id=dir_id, verbose=verbose
|
|
|
|
|
- )
|
|
|
|
|
- if company_result:
|
|
|
|
|
- results.append(company_result)
|
|
|
|
|
-
|
|
|
|
|
|
|
+ expert_success = False
|
|
|
|
|
+ company_success = False
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 处理专家评审
|
|
|
|
|
+ expert_success, expert_file, expert_result = process_review_type(
|
|
|
|
|
+ dir_path, phase=phase, max_pages=max_pages,
|
|
|
|
|
+ review_type="expert", output_dir=expert_output_dir,
|
|
|
|
|
+ dir_id=dir_id, verbose=verbose
|
|
|
|
|
+ )
|
|
|
|
|
+ if expert_result:
|
|
|
|
|
+ results.append(expert_result)
|
|
|
|
|
+
|
|
|
|
|
+ # 处理公司评审
|
|
|
|
|
+ company_success, company_file, company_result = process_review_type(
|
|
|
|
|
+ dir_path, phase=phase, max_pages=max_pages,
|
|
|
|
|
+ review_type="company", output_dir=company_output_dir,
|
|
|
|
|
+ dir_id=dir_id, verbose=verbose
|
|
|
|
|
+ )
|
|
|
|
|
+ if company_result:
|
|
|
|
|
+ results.append(company_result)
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ # 捕获异常,记录为处理异常
|
|
|
|
|
+ error_result = {
|
|
|
|
|
+ '目录ID': dir_id,
|
|
|
|
|
+ '评审类型': 'all',
|
|
|
|
|
+ '阶段': f'第{phase}阶段',
|
|
|
|
|
+ '原文件名': '',
|
|
|
|
|
+ '新文件名': '',
|
|
|
|
|
+ '状态': '处理异常',
|
|
|
|
|
+ '匹配来源': '',
|
|
|
|
|
+ '匹配关键词': '',
|
|
|
|
|
+ '匹配文件数': 0,
|
|
|
|
|
+ '文件大小_MB': 0,
|
|
|
|
|
+ '备注': str(e),
|
|
|
|
|
+ '原路径': str(dir_path),
|
|
|
|
|
+ '目标路径': '',
|
|
|
|
|
+ '处理时间': datetime.now().isoformat()
|
|
|
|
|
+ }
|
|
|
|
|
+ results.append(error_result)
|
|
|
|
|
+
|
|
|
return {
|
|
return {
|
|
|
"dir_id": dir_id,
|
|
"dir_id": dir_id,
|
|
|
"expert_success": expert_success,
|
|
"expert_success": expert_success,
|
|
@@ -793,12 +922,13 @@ def process_directory_worker(args_tuple):
|
|
|
def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: int,
|
|
def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: int,
|
|
|
expert_output_dir: Path, company_output_dir: Path,
|
|
expert_output_dir: Path, company_output_dir: Path,
|
|
|
temp_dir: Path, numeric_dirs: List[Path],
|
|
temp_dir: Path, numeric_dirs: List[Path],
|
|
|
- stats: Dict, processed_dirs: Set,
|
|
|
|
|
|
|
+ stats: Dict, processed_dirs: Set,
|
|
|
phase1_no_match_expert: Set, phase1_no_match_company: Set,
|
|
phase1_no_match_expert: Set, phase1_no_match_company: Set,
|
|
|
|
|
+ num_workers: int = DEFAULT_NUM_WORKERS,
|
|
|
verbose: bool = False, cache_every: int = 10) -> Tuple[Dict, Set, Set]:
|
|
verbose: bool = False, cache_every: int = 10) -> Tuple[Dict, Set, Set]:
|
|
|
"""
|
|
"""
|
|
|
并发运行一个阶段的筛选
|
|
并发运行一个阶段的筛选
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
Args:
|
|
Args:
|
|
|
dirs_to_process: 待处理的目录列表
|
|
dirs_to_process: 待处理的目录列表
|
|
|
phase: 阶段(1或2)
|
|
phase: 阶段(1或2)
|
|
@@ -811,16 +941,17 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
|
|
|
processed_dirs: 已处理目录集合(会被修改)
|
|
processed_dirs: 已处理目录集合(会被修改)
|
|
|
phase1_no_match_expert: 一级未匹配专家评审的目录集合(会被修改)
|
|
phase1_no_match_expert: 一级未匹配专家评审的目录集合(会被修改)
|
|
|
phase1_no_match_company: 一级未匹配公司评审的目录集合(会被修改)
|
|
phase1_no_match_company: 一级未匹配公司评审的目录集合(会被修改)
|
|
|
|
|
+ num_workers: 并发进程数
|
|
|
verbose: 工作进程是否打印详细日志
|
|
verbose: 工作进程是否打印详细日志
|
|
|
cache_every: 每处理多少个目录保存一次缓存
|
|
cache_every: 每处理多少个目录保存一次缓存
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
Returns:
|
|
Returns:
|
|
|
(stats, phase1_no_match_expert, phase1_no_match_company)
|
|
(stats, phase1_no_match_expert, phase1_no_match_company)
|
|
|
"""
|
|
"""
|
|
|
total = len(dirs_to_process)
|
|
total = len(dirs_to_process)
|
|
|
completed = 0
|
|
completed = 0
|
|
|
all_results = []
|
|
all_results = []
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 构建参数列表
|
|
# 构建参数列表
|
|
|
args_list = []
|
|
args_list = []
|
|
|
for dir_path in dirs_to_process:
|
|
for dir_path in dirs_to_process:
|
|
@@ -832,21 +963,28 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
|
|
|
str(company_output_dir),
|
|
str(company_output_dir),
|
|
|
verbose
|
|
verbose
|
|
|
))
|
|
))
|
|
|
-
|
|
|
|
|
- print(f"\n 启动 {NUM_WORKERS} 个并发进程处理 {total} 个目录...")
|
|
|
|
|
|
|
+
|
|
|
|
|
+ print(f"\n 启动 {num_workers} 个并发进程处理 {total} 个目录...")
|
|
|
print(f" 工作模式: {'详细日志' if verbose else '静默模式(仅显示进度)'}\n")
|
|
print(f" 工作模式: {'详细日志' if verbose else '静默模式(仅显示进度)'}\n")
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 使用进程池并发处理
|
|
# 使用进程池并发处理
|
|
|
- with mp.Pool(processes=NUM_WORKERS) as pool:
|
|
|
|
|
|
|
+ with mp.Pool(processes=num_workers) as pool:
|
|
|
# imap_unordered 不保证顺序,但返回速度最快
|
|
# imap_unordered 不保证顺序,但返回速度最快
|
|
|
for result in pool.imap_unordered(process_directory_worker, args_list):
|
|
for result in pool.imap_unordered(process_directory_worker, args_list):
|
|
|
- dir_id = result["dir_id"]
|
|
|
|
|
-
|
|
|
|
|
|
|
+ dir_id = result.get("dir_id", "unknown")
|
|
|
|
|
+ results_list = result.get("results", [])
|
|
|
|
|
+
|
|
|
|
|
+ # 检查是否有处理异常
|
|
|
|
|
+ has_error = any(r.get("状态") in ["处理异常", "复制失败"] for r in results_list)
|
|
|
|
|
+
|
|
|
# 更新结果列表
|
|
# 更新结果列表
|
|
|
- all_results.extend(result["results"])
|
|
|
|
|
-
|
|
|
|
|
|
|
+ all_results.extend(results_list)
|
|
|
|
|
+
|
|
|
# 更新统计
|
|
# 更新统计
|
|
|
- if result["expert_success"]:
|
|
|
|
|
|
|
+ expert_success = result.get("expert_success", False)
|
|
|
|
|
+ company_success = result.get("company_success", False)
|
|
|
|
|
+
|
|
|
|
|
+ if expert_success:
|
|
|
stats["expert_success_count"] = stats.get("expert_success_count", 0) + 1
|
|
stats["expert_success_count"] = stats.get("expert_success_count", 0) + 1
|
|
|
# 如果之前标记为未匹配,现在成功了,移除标记
|
|
# 如果之前标记为未匹配,现在成功了,移除标记
|
|
|
phase1_no_match_expert.discard(dir_id)
|
|
phase1_no_match_expert.discard(dir_id)
|
|
@@ -854,25 +992,26 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
|
|
|
# 只有在一级筛选时才添加未匹配标记
|
|
# 只有在一级筛选时才添加未匹配标记
|
|
|
if phase == 1:
|
|
if phase == 1:
|
|
|
phase1_no_match_expert.add(dir_id)
|
|
phase1_no_match_expert.add(dir_id)
|
|
|
-
|
|
|
|
|
- if result["company_success"]:
|
|
|
|
|
|
|
+
|
|
|
|
|
+ if company_success:
|
|
|
stats["company_success_count"] = stats.get("company_success_count", 0) + 1
|
|
stats["company_success_count"] = stats.get("company_success_count", 0) + 1
|
|
|
phase1_no_match_company.discard(dir_id)
|
|
phase1_no_match_company.discard(dir_id)
|
|
|
else:
|
|
else:
|
|
|
if phase == 1:
|
|
if phase == 1:
|
|
|
phase1_no_match_company.add(dir_id)
|
|
phase1_no_match_company.add(dir_id)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 标记为已处理
|
|
# 标记为已处理
|
|
|
processed_dirs.add(dir_id)
|
|
processed_dirs.add(dir_id)
|
|
|
completed += 1
|
|
completed += 1
|
|
|
-
|
|
|
|
|
- # 显示进度
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 显示进度(如果有异常则显示警告)
|
|
|
progress = completed / total * 100
|
|
progress = completed / total * 100
|
|
|
|
|
+ status_indicator = "⚠️" if has_error else ""
|
|
|
print(f"\r 进度: {completed}/{total} ({progress:.1f}%) | "
|
|
print(f"\r 进度: {completed}/{total} ({progress:.1f}%) | "
|
|
|
f"专家成功: {stats.get('expert_success_count', 0)} | "
|
|
f"专家成功: {stats.get('expert_success_count', 0)} | "
|
|
|
f"公司成功: {stats.get('company_success_count', 0)} | "
|
|
f"公司成功: {stats.get('company_success_count', 0)} | "
|
|
|
- f"当前: {dir_id}", end="", flush=True)
|
|
|
|
|
-
|
|
|
|
|
|
|
+ f"当前: {dir_id} {status_indicator}", end="", flush=True)
|
|
|
|
|
+
|
|
|
# 定期保存缓存
|
|
# 定期保存缓存
|
|
|
if completed % cache_every == 0 or completed == total:
|
|
if completed % cache_every == 0 or completed == total:
|
|
|
cache_data = {
|
|
cache_data = {
|
|
@@ -885,7 +1024,7 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
|
|
|
"total_directories": len(numeric_dirs)
|
|
"total_directories": len(numeric_dirs)
|
|
|
}
|
|
}
|
|
|
save_progress_cache(temp_dir, cache_data)
|
|
save_progress_cache(temp_dir, cache_data)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
print(f"\n\n ✅ 阶段完成!处理 {completed} 个目录")
|
|
print(f"\n\n ✅ 阶段完成!处理 {completed} 个目录")
|
|
|
print(f" 专家评审成功: {stats.get('expert_success_count', 0)} 个")
|
|
print(f" 专家评审成功: {stats.get('expert_success_count', 0)} 个")
|
|
|
print(f" 公司评审成功: {stats.get('company_success_count', 0)} 个")
|
|
print(f" 公司评审成功: {stats.get('company_success_count', 0)} 个")
|
|
@@ -895,28 +1034,44 @@ def run_phase_concurrently(dirs_to_process: List[Path], phase: int, max_pages: i
|
|
|
|
|
|
|
|
def main():
|
|
def main():
|
|
|
"""主函数"""
|
|
"""主函数"""
|
|
|
|
|
+ # 解析命令行参数
|
|
|
|
|
+ args = parse_args()
|
|
|
|
|
+
|
|
|
print("=" * 70)
|
|
print("=" * 70)
|
|
|
- print("评审意见PDF筛选脚本 - 多进程并发版+测试模式")
|
|
|
|
|
|
|
+ print("评审意见PDF筛选脚本 - 多进程并发版+测试模式+命令行参数")
|
|
|
print("=" * 70)
|
|
print("=" * 70)
|
|
|
|
|
|
|
|
- # 按文件首部配置组装路径(不再按项目根目录拼接)
|
|
|
|
|
|
|
+ # 按参数组装路径
|
|
|
script_dir = Path(__file__).parent
|
|
script_dir = Path(__file__).parent
|
|
|
- source_base_dir = resolve_config_path(SOURCE_DIR, script_dir)
|
|
|
|
|
- expert_output_dir = resolve_config_path(EXPERT_OUTPUT_DIR, script_dir)
|
|
|
|
|
- company_output_dir = resolve_config_path(COMPANY_OUTPUT_DIR, script_dir)
|
|
|
|
|
- temp_dir = resolve_config_path(TEMP_DIR, script_dir)
|
|
|
|
|
|
|
+ source_base_dir = resolve_config_path(args.source_dir, script_dir)
|
|
|
|
|
+ expert_output_dir = resolve_config_path(args.expert_output_dir, script_dir)
|
|
|
|
|
+ company_output_dir = resolve_config_path(args.company_output_dir, script_dir)
|
|
|
|
|
+ temp_dir = resolve_config_path(args.temp_dir, script_dir)
|
|
|
|
|
+
|
|
|
|
|
+ # 从参数获取配置
|
|
|
|
|
+ num_workers = args.workers
|
|
|
|
|
+ test_mode = args.test_mode
|
|
|
|
|
+ test_sample_size = args.test_sample_size
|
|
|
|
|
+ phase1_pages = args.phase1_pages
|
|
|
|
|
+ phase2_pages = args.phase2_pages
|
|
|
|
|
+ disable_phase2 = args.disable_phase2
|
|
|
|
|
+ pdf_timeout = args.pdf_timeout
|
|
|
|
|
+ cache_every = args.cache_every
|
|
|
|
|
+ verbose_mode = args.verbose
|
|
|
|
|
+ retry_failed = args.retry_failed
|
|
|
|
|
|
|
|
print(f"\n【配置信息】")
|
|
print(f"\n【配置信息】")
|
|
|
print(f" 源目录: {source_base_dir}")
|
|
print(f" 源目录: {source_base_dir}")
|
|
|
print(f" 专家评审输出目录: {expert_output_dir}")
|
|
print(f" 专家评审输出目录: {expert_output_dir}")
|
|
|
print(f" 公司评审输出目录: {company_output_dir}")
|
|
print(f" 公司评审输出目录: {company_output_dir}")
|
|
|
print(f" 缓存目录: {temp_dir}")
|
|
print(f" 缓存目录: {temp_dir}")
|
|
|
- print(f" 并发进程数: {NUM_WORKERS}")
|
|
|
|
|
- print(f" 测试模式: {'是(抽取5个目录)' if TEST_MODE else '否'}")
|
|
|
|
|
- print(f" 专家评审关键词: {KEYWORDS['expert']}")
|
|
|
|
|
- print(f" 公司评审关键词: {KEYWORDS['company']}")
|
|
|
|
|
- print(f" 一级筛选: 前{PHASE_1_PAGES}页")
|
|
|
|
|
- print(f" 二级筛选: 前{PHASE_2_PAGES}页")
|
|
|
|
|
|
|
+ print(f" 并发进程数: {num_workers}")
|
|
|
|
|
+ print(f" 测试模式: {'是(抽取{}个目录)'.format(test_sample_size) if test_mode else '否'}")
|
|
|
|
|
+ print(f" 一级筛选: 前{phase1_pages}页")
|
|
|
|
|
+ print(f" 二级筛选: {'禁用' if disable_phase2 else '前{}页'.format(phase2_pages)}")
|
|
|
|
|
+ print(f" PDF超时秒数: {pdf_timeout}")
|
|
|
|
|
+ print(f" 详细日志: {'是' if verbose_mode else '否'}")
|
|
|
|
|
+ print(f" 失败重试: {'是' if retry_failed else '否'}")
|
|
|
|
|
|
|
|
# 检查 PyCryptodome 库
|
|
# 检查 PyCryptodome 库
|
|
|
try:
|
|
try:
|
|
@@ -936,19 +1091,19 @@ def main():
|
|
|
sys.exit(1)
|
|
sys.exit(1)
|
|
|
|
|
|
|
|
print(f" 找到 {len(numeric_dirs)} 个数字编号子目录")
|
|
print(f" 找到 {len(numeric_dirs)} 个数字编号子目录")
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# ==================== 测试模式:随机抽取 ====================
|
|
# ==================== 测试模式:随机抽取 ====================
|
|
|
- if TEST_MODE:
|
|
|
|
|
- print(f"\n【测试模式】随机抽取 {TEST_SAMPLE_SIZE} 个目录进行测试...")
|
|
|
|
|
- if len(numeric_dirs) <= TEST_SAMPLE_SIZE:
|
|
|
|
|
|
|
+ if test_mode:
|
|
|
|
|
+ print(f"\n【测试模式】随机抽取 {test_sample_size} 个目录进行测试...")
|
|
|
|
|
+ if len(numeric_dirs) <= test_sample_size:
|
|
|
test_dirs = numeric_dirs
|
|
test_dirs = numeric_dirs
|
|
|
- print(f" 目录总数不足 {TEST_SAMPLE_SIZE},测试全部 {len(numeric_dirs)} 个目录")
|
|
|
|
|
|
|
+ print(f" 目录总数不足 {test_sample_size},测试全部 {len(numeric_dirs)} 个目录")
|
|
|
else:
|
|
else:
|
|
|
# 使用固定随机种子,确保可复现
|
|
# 使用固定随机种子,确保可复现
|
|
|
random.seed(42)
|
|
random.seed(42)
|
|
|
- test_dirs = random.sample(numeric_dirs, TEST_SAMPLE_SIZE)
|
|
|
|
|
|
|
+ test_dirs = random.sample(numeric_dirs, test_sample_size)
|
|
|
test_dirs.sort(key=lambda d: int(d.name)) # 按数字排序,方便查看
|
|
test_dirs.sort(key=lambda d: int(d.name)) # 按数字排序,方便查看
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
numeric_dirs = test_dirs
|
|
numeric_dirs = test_dirs
|
|
|
print(f" 测试目录: {[d.name for d in test_dirs]}")
|
|
print(f" 测试目录: {[d.name for d in test_dirs]}")
|
|
|
# 测试模式不加载缓存,不进入二级筛选
|
|
# 测试模式不加载缓存,不进入二级筛选
|
|
@@ -965,26 +1120,46 @@ def main():
|
|
|
# 加载缓存(断点续传)
|
|
# 加载缓存(断点续传)
|
|
|
print(f"\n【步骤 2/6】加载进度缓存...")
|
|
print(f"\n【步骤 2/6】加载进度缓存...")
|
|
|
cache = load_progress_cache(temp_dir)
|
|
cache = load_progress_cache(temp_dir)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
processed_dirs = set(cache.get("processed_dirs", []))
|
|
processed_dirs = set(cache.get("processed_dirs", []))
|
|
|
phase1_no_match_expert = set(cache.get("phase1_no_match_expert", []))
|
|
phase1_no_match_expert = set(cache.get("phase1_no_match_expert", []))
|
|
|
phase1_no_match_company = set(cache.get("phase1_no_match_company", []))
|
|
phase1_no_match_company = set(cache.get("phase1_no_match_company", []))
|
|
|
current_phase = cache.get("phase", 1)
|
|
current_phase = cache.get("phase", 1)
|
|
|
all_results = cache.get("results", [])
|
|
all_results = cache.get("results", [])
|
|
|
|
|
|
|
|
- if processed_dirs and not TEST_MODE:
|
|
|
|
|
|
|
+ # ==================== 失败重试机制 ====================
|
|
|
|
|
+ # 从历史结果中识别失败的目录
|
|
|
|
|
+ failed_dir_ids = set()
|
|
|
|
|
+ if all_results:
|
|
|
|
|
+ for result in all_results:
|
|
|
|
|
+ status = result.get("状态", "")
|
|
|
|
|
+ if status in ["处理异常", "复制失败", "读取失败", "解析失败"]:
|
|
|
|
|
+ failed_dir_ids.add(str(result.get("目录ID", "")))
|
|
|
|
|
+
|
|
|
|
|
+ if retry_failed and failed_dir_ids:
|
|
|
|
|
+ print(f"\n【失败重试】发现 {len(failed_dir_ids)} 个历史失败的目录:")
|
|
|
|
|
+ print(f" 失败目录ID: {sorted(list(failed_dir_ids), key=lambda x: int(x) if x.isdigit() else 0)[:10]}{'...' if len(failed_dir_ids) > 10 else ''}")
|
|
|
|
|
+ # 将失败目录从已处理列表中移除,使其重新被处理
|
|
|
|
|
+ processed_dirs = {d for d in processed_dirs if d not in failed_dir_ids}
|
|
|
|
|
+ # 同时从结果列表中移除失败记录,避免重复
|
|
|
|
|
+ all_results = [r for r in all_results if str(r.get("目录ID", "")) not in failed_dir_ids]
|
|
|
|
|
+ print(f" 已将失败目录重新加入待处理列表")
|
|
|
|
|
+
|
|
|
|
|
+ if processed_dirs and not test_mode:
|
|
|
print(f" 发现缓存:")
|
|
print(f" 发现缓存:")
|
|
|
print(f" - 已处理: {len(processed_dirs)} 个目录")
|
|
print(f" - 已处理: {len(processed_dirs)} 个目录")
|
|
|
print(f" - 专家评审一级未找到: {len(phase1_no_match_expert)} 个目录")
|
|
print(f" - 专家评审一级未找到: {len(phase1_no_match_expert)} 个目录")
|
|
|
print(f" - 公司评审一级未找到: {len(phase1_no_match_company)} 个目录")
|
|
print(f" - 公司评审一级未找到: {len(phase1_no_match_company)} 个目录")
|
|
|
print(f" - 当前阶段: 第{current_phase}阶段")
|
|
print(f" - 当前阶段: 第{current_phase}阶段")
|
|
|
|
|
+ if failed_dir_ids and not retry_failed:
|
|
|
|
|
+ print(f" - 历史失败目录: {len(failed_dir_ids)} 个(可通过 --retry-failed 参数重试)")
|
|
|
else:
|
|
else:
|
|
|
- print(f" {'无缓存(测试模式),将从头开始处理' if TEST_MODE else '无缓存,将从头开始处理'}")
|
|
|
|
|
|
|
+ print(f" {'无缓存(测试模式),将从头开始处理' if test_mode else '无缓存,将从头开始处理'}")
|
|
|
|
|
|
|
|
# ==================== 一级筛选 ====================
|
|
# ==================== 一级筛选 ====================
|
|
|
if current_phase == 1:
|
|
if current_phase == 1:
|
|
|
print(f"\n{'='*70}")
|
|
print(f"\n{'='*70}")
|
|
|
- print("【第1阶段】一级筛选(检查前15页)")
|
|
|
|
|
|
|
+ print(f"【第1阶段】一级筛选(检查前{phase1_pages}页)")
|
|
|
print('='*70)
|
|
print('='*70)
|
|
|
|
|
|
|
|
dirs_to_process = [d for d in numeric_dirs if d.name not in processed_dirs]
|
|
dirs_to_process = [d for d in numeric_dirs if d.name not in processed_dirs]
|
|
@@ -1017,19 +1192,20 @@ def main():
|
|
|
stats[key] = value
|
|
stats[key] = value
|
|
|
|
|
|
|
|
# 并发处理一级筛选
|
|
# 并发处理一级筛选
|
|
|
- # 在测试模式下使用详细日志(verbose=True),正式运行使用静默模式(verbose=False)
|
|
|
|
|
- verbose_mode = TEST_MODE # 测试模式打印详细日志,正式模式静默
|
|
|
|
|
|
|
+ # 测试模式或 verbose 参数启用时打印详细日志
|
|
|
|
|
+ phase1_verbose = test_mode or verbose_mode
|
|
|
stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
|
- dirs_to_process, phase=1, max_pages=PHASE_1_PAGES,
|
|
|
|
|
|
|
+ dirs_to_process, phase=1, max_pages=phase1_pages,
|
|
|
expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
|
temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
|
stats=stats, processed_dirs=processed_dirs,
|
|
stats=stats, processed_dirs=processed_dirs,
|
|
|
phase1_no_match_expert=phase1_no_match_expert,
|
|
phase1_no_match_expert=phase1_no_match_expert,
|
|
|
phase1_no_match_company=phase1_no_match_company,
|
|
phase1_no_match_company=phase1_no_match_company,
|
|
|
- verbose=verbose_mode,
|
|
|
|
|
- cache_every=10
|
|
|
|
|
|
|
+ num_workers=num_workers,
|
|
|
|
|
+ verbose=phase1_verbose,
|
|
|
|
|
+ cache_every=cache_every
|
|
|
)
|
|
)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 更新结果列表
|
|
# 更新结果列表
|
|
|
cache = load_progress_cache(temp_dir)
|
|
cache = load_progress_cache(temp_dir)
|
|
|
all_results = cache.get("results", [])
|
|
all_results = cache.get("results", [])
|
|
@@ -1041,7 +1217,7 @@ def main():
|
|
|
print(f" 公司评审未找到: {len(phase1_no_match_company)} 个")
|
|
print(f" 公司评审未找到: {len(phase1_no_match_company)} 个")
|
|
|
|
|
|
|
|
# 测试模式下直接退出,不进行二级筛选和保存
|
|
# 测试模式下直接退出,不进行二级筛选和保存
|
|
|
- if TEST_MODE:
|
|
|
|
|
|
|
+ if test_mode:
|
|
|
print(f"\n{'='*70}")
|
|
print(f"\n{'='*70}")
|
|
|
print("【测试模式完成】")
|
|
print("【测试模式完成】")
|
|
|
print(f" 共测试 {len(dirs_to_process)} 个目录")
|
|
print(f" 共测试 {len(dirs_to_process)} 个目录")
|
|
@@ -1049,7 +1225,7 @@ def main():
|
|
|
print(f" 公司评审成功: {stats['company_success_count']} 个")
|
|
print(f" 公司评审成功: {stats['company_success_count']} 个")
|
|
|
print(f" 测试结果已保存到缓存,可查看输出目录确认文件")
|
|
print(f" 测试结果已保存到缓存,可查看输出目录确认文件")
|
|
|
print("="*70)
|
|
print("="*70)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 测试模式也保存最终结果
|
|
# 测试模式也保存最终结果
|
|
|
_save_final_results(
|
|
_save_final_results(
|
|
|
temp_dir, expert_output_dir, company_output_dir,
|
|
temp_dir, expert_output_dir, company_output_dir,
|
|
@@ -1057,51 +1233,62 @@ def main():
|
|
|
)
|
|
)
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
- # 询问是否进行二级筛选
|
|
|
|
|
- total_no_match = len(phase1_no_match_expert.union(phase1_no_match_company))
|
|
|
|
|
- print(f"\n{'='*70}")
|
|
|
|
|
- print("是否进行二级筛选?")
|
|
|
|
|
- print(f" - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
|
|
|
|
|
- print(f" - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
|
|
|
|
|
- print(f" - 扩大检查范围到前{PHASE_2_PAGES}页")
|
|
|
|
|
- print('='*70)
|
|
|
|
|
-
|
|
|
|
|
- while True:
|
|
|
|
|
- try:
|
|
|
|
|
- user_input = input("请输入 (y/n): ").strip().lower()
|
|
|
|
|
- if user_input in ['y', 'yes', '是']:
|
|
|
|
|
- enable_phase2 = True
|
|
|
|
|
- break
|
|
|
|
|
- elif user_input in ['n', 'no', '否']:
|
|
|
|
|
- enable_phase2 = False
|
|
|
|
|
- break
|
|
|
|
|
- else:
|
|
|
|
|
- print(" 请输入 y 或 n")
|
|
|
|
|
- except KeyboardInterrupt:
|
|
|
|
|
- print("\n\n用户中断,进度已保存")
|
|
|
|
|
- sys.exit(0)
|
|
|
|
|
-
|
|
|
|
|
- if not enable_phase2:
|
|
|
|
|
|
|
+ # 判断是否进行二级筛选
|
|
|
|
|
+ # 如果参数指定禁用二级筛选,则直接跳过
|
|
|
|
|
+ if disable_phase2:
|
|
|
|
|
+ print(f"\n{'='*70}")
|
|
|
|
|
+ print("二级筛选已禁用(--disable-phase2 参数)")
|
|
|
|
|
+ print(f" - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
|
|
|
|
|
+ print(f" - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
|
|
|
|
|
+ print('='*70)
|
|
|
print("\n 跳过二级筛选,直接保存结果...")
|
|
print("\n 跳过二级筛选,直接保存结果...")
|
|
|
- current_phase = 3 # 跳过二级筛选,直接保存
|
|
|
|
|
|
|
+ current_phase = 3
|
|
|
else:
|
|
else:
|
|
|
- current_phase = 2
|
|
|
|
|
- # 保存进入第二阶段的标记
|
|
|
|
|
- cache_data = {
|
|
|
|
|
- "processed_dirs": list(processed_dirs),
|
|
|
|
|
- "phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
|
|
- "phase1_no_match_company": list(phase1_no_match_company),
|
|
|
|
|
- "results": all_results,
|
|
|
|
|
- "stats": stats,
|
|
|
|
|
- "phase": 2,
|
|
|
|
|
- "total_directories": len(numeric_dirs)
|
|
|
|
|
- }
|
|
|
|
|
- save_progress_cache(temp_dir, cache_data)
|
|
|
|
|
|
|
+ # 询问是否进行二级筛选
|
|
|
|
|
+ total_no_match = len(phase1_no_match_expert.union(phase1_no_match_company))
|
|
|
|
|
+ print(f"\n{'='*70}")
|
|
|
|
|
+ print("是否进行二级筛选?")
|
|
|
|
|
+ print(f" - 专家评审未找到: {len(phase1_no_match_expert)} 个目录")
|
|
|
|
|
+ print(f" - 公司评审未找到: {len(phase1_no_match_company)} 个目录")
|
|
|
|
|
+ print(f" - 扩大检查范围到前{phase2_pages}页")
|
|
|
|
|
+ print('='*70)
|
|
|
|
|
+
|
|
|
|
|
+ while True:
|
|
|
|
|
+ try:
|
|
|
|
|
+ user_input = input("请输入 (y/n): ").strip().lower()
|
|
|
|
|
+ if user_input in ['y', 'yes', '是']:
|
|
|
|
|
+ enable_phase2 = True
|
|
|
|
|
+ break
|
|
|
|
|
+ elif user_input in ['n', 'no', '否']:
|
|
|
|
|
+ enable_phase2 = False
|
|
|
|
|
+ break
|
|
|
|
|
+ else:
|
|
|
|
|
+ print(" 请输入 y 或 n")
|
|
|
|
|
+ except KeyboardInterrupt:
|
|
|
|
|
+ print("\n\n用户中断,进度已保存")
|
|
|
|
|
+ sys.exit(0)
|
|
|
|
|
+
|
|
|
|
|
+ if not enable_phase2:
|
|
|
|
|
+ print("\n 跳过二级筛选,直接保存结果...")
|
|
|
|
|
+ current_phase = 3 # 跳过二级筛选,直接保存
|
|
|
|
|
+ else:
|
|
|
|
|
+ current_phase = 2
|
|
|
|
|
+ # 保存进入第二阶段的标记
|
|
|
|
|
+ cache_data = {
|
|
|
|
|
+ "processed_dirs": list(processed_dirs),
|
|
|
|
|
+ "phase1_no_match_expert": list(phase1_no_match_expert),
|
|
|
|
|
+ "phase1_no_match_company": list(phase1_no_match_company),
|
|
|
|
|
+ "results": all_results,
|
|
|
|
|
+ "stats": stats,
|
|
|
|
|
+ "phase": 2,
|
|
|
|
|
+ "total_directories": len(numeric_dirs)
|
|
|
|
|
+ }
|
|
|
|
|
+ save_progress_cache(temp_dir, cache_data)
|
|
|
|
|
|
|
|
# ==================== 二级筛选 ====================
|
|
# ==================== 二级筛选 ====================
|
|
|
if current_phase == 2:
|
|
if current_phase == 2:
|
|
|
print(f"\n{'='*70}")
|
|
print(f"\n{'='*70}")
|
|
|
- print("【第2阶段】二级筛选(检查前30页)")
|
|
|
|
|
|
|
+ print(f"【第2阶段】二级筛选(检查前{phase2_pages}页)")
|
|
|
print('='*70)
|
|
print('='*70)
|
|
|
|
|
|
|
|
# 重新加载以获取最新状态
|
|
# 重新加载以获取最新状态
|
|
@@ -1131,18 +1318,19 @@ def main():
|
|
|
# 但由于 process_directory_worker 会同时处理两种类型,
|
|
# 但由于 process_directory_worker 会同时处理两种类型,
|
|
|
# 已成功的类型会再次被处理(但结果相同,不会重复复制因为文件名相同会覆盖)
|
|
# 已成功的类型会再次被处理(但结果相同,不会重复复制因为文件名相同会覆盖)
|
|
|
# 为了效率,我们只处理有未匹配的目录
|
|
# 为了效率,我们只处理有未匹配的目录
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
stats, phase1_no_match_expert, phase1_no_match_company = run_phase_concurrently(
|
|
|
- all_phase2_dirs, phase=2, max_pages=PHASE_2_PAGES,
|
|
|
|
|
|
|
+ all_phase2_dirs, phase=2, max_pages=phase2_pages,
|
|
|
expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
expert_output_dir=expert_output_dir, company_output_dir=company_output_dir,
|
|
|
temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
temp_dir=temp_dir, numeric_dirs=numeric_dirs,
|
|
|
stats=stats, processed_dirs=processed_dirs,
|
|
stats=stats, processed_dirs=processed_dirs,
|
|
|
phase1_no_match_expert=phase1_no_match_expert,
|
|
phase1_no_match_expert=phase1_no_match_expert,
|
|
|
phase1_no_match_company=phase1_no_match_company,
|
|
phase1_no_match_company=phase1_no_match_company,
|
|
|
- verbose=False, # 二级筛选使用静默模式
|
|
|
|
|
- cache_every=10
|
|
|
|
|
|
|
+ num_workers=num_workers,
|
|
|
|
|
+ verbose=verbose_mode,
|
|
|
|
|
+ cache_every=cache_every
|
|
|
)
|
|
)
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
# 更新结果
|
|
# 更新结果
|
|
|
cache = load_progress_cache(temp_dir)
|
|
cache = load_progress_cache(temp_dir)
|
|
|
all_results = cache.get("results", [])
|
|
all_results = cache.get("results", [])
|