5 commits 175446bb96 ... 00336af860

Author SHA1 Message Date
  WangXuMing 00336af860 docs: update infrastructure design document to v3.0, resolve merge conflicts 1 week ago
  tangle fabc646a49 Merge branch 'dev-new-rules' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 1 week ago
  tangle aeeaa785e9 fix(replace with new pdf_extractor file) 1 week ago
  tangle b93cd363db fix(replace with new pdf_extractor file) 1 week ago
  WangXuMing a289e0af6f Merge branch 'dev_sgsc_wxm' of CRBC-MaaS-Platform-Project/LQAgentPlatform into dev 1 week ago

+ 1326 - 0
.design/施工方案审查应用/基础架构设计/施工审查应用基础架构设计.md

# LQAgentPlatform Final Architecture Design

> **Version**: v3.0
> **Updated**: 2026-04-09
> **Project status**: core features implemented, running in production
> **v2.0 → v3.0 changes**: added the construction plan writing module, the desensitization module, the minimal pipeline, standard matching, LLM content classifier v2, and the LLM chain-client framework

---

## Final Architecture Diagram

```
LQAgentPlatform - implemented architecture (v3.0)

┌─ Server Layer ──────────────────────────────────────────────────
│  FastAPI application (server/app.py)
│  • RouteManager        - route management
│  • CeleryWorkerManager - async task management
│  • ApplicationFactory  - application factory
└─────────────────────────────────────────────────────────────────
              ▲
              │ HTTP requests
              ▼
┌─ Views Layer ───────────────────────────────────────────────────
│  construction_review/ - construction plan review API ✅ (implemented)
│  • file_upload.py           - document upload
│  • launch_review.py         - launch review
│  • review_results.py        - review results
│  • task_control.py          - task control
│  • desensitize_api.py       - desensitization API 🆕
│  • schemas/error_schemas.py - error schema definitions
│
│  construction_write/ - construction plan writing API ✅ (implemented) 🆕
│  • outline_views.py          - outline generation
│  • content_completion.py     - content completion
│  • regenerate_views.py       - regeneration
│  • task_cancel_views.py      - task cancellation
│  • similar_plan_recommend.py - similar-plan recommendation
│
│  test_views.py - test endpoints
└─────────────────────────────────────────────────────────────────
              ▲
              │ business-logic calls
              ▼
┌─ Core Layer ────────────────────────────────────────────────────
│  construction_review/ - construction plan review module ✅ (implemented)
│    workflows/ - workflow layer
│    • ai_review_workflow.py - AI review workflow (54KB, core)
│    • document_workflow.py  - document processing workflow
│    • report_workflow.py    - report generation workflow
│    • core_functions/       - core business functions 🆕
│    • types/                - state types (AIReviewState, TaskChainState) 🆕
│    component/ - component layer
│    • ai_review_engine.py          - AI review engine (core)
│    • document_processor.py        - document processor
│    • report_generator.py          - report generator
│    • outline_catalogue_matcher.py - outline/TOC matcher 🆕
│    • constants.py                 - constant definitions 🆕
│    • check_completeness/          - completeness-check components
│    • desensitize/                 - data desensitization module 🆕
│      - engine.py, validator.py, dict_manager.py
│      - model_client.py, remapper.py
│      - processors/ (pii, geo, biz, financial)
│    • doc_worker/                  - document worker
│      - classification/ (chunk, hierarchy, smart_local) 🆕
│      - pdf_worker/ (html_to_markdown) 🆕
│      - config/ (StandardCategoryTable.csv, prompt.yaml)
│    • minimal_pipeline/            - minimal processing pipeline 🆕
│      - ocr_processor.py, pdf_extractor.py
│      - toc_builder.py, toc_detector.py
│      - simple_processor.py, chunk_assembler.py
│    • standard_matching/           - standard matching module 🆕
│      - standard_dao.py, standard_service.py
│    • infrastructure/              - infrastructure components
│    • reviewers/                   - reviewer collection
│      - base_reviewer.py                - base reviewer
│      - completeness_reviewer.py        - completeness reviewer
│      - reference_basis_reviewer.py     - reference-basis reviewer
│      - standard_timeliness_reviewer.py - standard-timeliness reviewer 🆕
│      - timeliness_basis_reviewer.py    - timeliness-basis reviewer
│      - timeliness_content_reviewer.py  - timeliness-content reviewer 🆕
│      - semantic_logic.py               - semantic-logic reviewer
│      - sensitive_word_check.py         - sensitive-word check 🆕
│      - utils/llm_content_classifier_v2/ - LLM content classifier v2 🆕
│      - utils/llm_chain_client/          - LLM chain-client framework 🆕
│
│  construction_write/ - construction plan writing module ✅ (implemented) 🆕
│    component/
│    • outline_generator.py - outline generator
│    • state_models.py      - state models
│    • prompt/              - prompt configuration
│    workflows/
│    • agent.py             - writing agent
│    • outline_workflow.py  - outline workflow
│
│  base/ - shared base components ✅ (implemented)
│  • progress_manager.py        - progress manager
│  • workflow_manager.py        - workflow manager
│  • sse_manager.py             - SSE (server-sent events) manager
│  • redis_duplicate_checker.py - Redis duplicate checker
│  • task_models.py             - task model definitions
│  • words_detect/              - sensitive-word detection
└─────────────────────────────────────────────────────────────────
              ▲
              │ infrastructure calls
              ▼
┌─ Foundation Layer ──────────────────────────────────────────────
│  ai/ - AI module ✅ (implemented)
│    agent/  - AI agents (base_agent.py, test_agent.py,
│              generate/, workflow/)
│    models/ - model management (model_handler.py - multi-model
│              manager, rerank_model.py - rerank model)
│    rag/    - retrieval-augmented generation (retrieval/,
│              query_rewrite.py, entities_enhance.py)
│
│  database/ - database module ✅ (implemented)
│    base/   - kg/ (Neo4j), sql/ (MySQL/PostgreSQL),
│              vector/ (Milvus/PG Vector)
│    models/ - data model definitions
│    repositories/ - data access layer
│    migrations/   - database migrations
│
│  infrastructure/ ✅ (implemented)
│  • cache/     - cache management (Redis)
│  • config/    - configuration management (config_handler)
│  • messaging/ - message queue (Celery)
│  • mysql/     - MySQL connection pool
│  • tracing/   - distributed tracing
│
│  observability/ ✅ (implemented)
│  • logger/         - logging
│  • metrics/        - metrics collection
│  • monitoring/     - monitoring (ai_trace_monitor.py, time_statistics.py)
│  • monitoring/rag/ - RAG monitoring 🆕
│  • cachefiles/     - cache-file management 🆕
│
│  schemas/ - data schemas ✅ (implemented)
│  utils/   - utilities ✅ (implemented)
│  • common.py, redis_utils.py, yaml_utils.py, tool_utils.py
│  • md5.py - MD5 helper 🆕
└─────────────────────────────────────────────────────────────────
              ▲
              │ data processing
              ▼
┌─ Data Pipeline Layer ───────────────────────────────────────────
│  document/ - document processing ✅ (partially implemented)
│  • parsers/    - document parsers (PDF, DOCX, OCR)
│  • processors/ - data processors
│
│  RAG_recall/ - RAG retrieval ✅ (implemented)
│  • rag_miluvs/ - Milvus RAG implementation
│
│  milvus_inbound_script/ - Milvus ingestion scripts
│  training_data/         - training-data processing
└─────────────────────────────────────────────────────────────────
              ▲
              │ data storage
              ▼
┌─ External Services ─────────────────────────────────────────────
│  MySQL (relational DB)     Milvus (vector DB)    Redis (cache)
│  Neo4j (knowledge graph)   AI models (local/cloud)
│  MinerU (OCR service)      Langfuse (observability)
└─────────────────────────────────────────────────────────────────
```

---

## Directory Structure (As Implemented)

```
LQAgentPlatform/
├── server/                          # server layer ✅
│   └── app.py                      # FastAPI application entry point
│
├── views/                           # views layer (API endpoints) ✅
│   ├── __init__.py                 # lifespan management (sensitive-word init, DB connection pools)
│   ├── construction_review/         # construction plan review API ✅
│   │   ├── file_upload.py          # document upload endpoint
│   │   ├── launch_review.py        # launch-review endpoint
│   │   ├── review_results.py       # review-results endpoint
│   │   ├── task_control.py         # task-control endpoint
│   │   ├── desensitize_api.py      # desensitization API endpoint 🆕
│   │   └── schemas/
│   │       └── error_schemas.py    # error schema definitions
│   ├── construction_write/          # construction plan writing API ✅ 🆕
│   │   ├── outline_views.py        # outline-generation endpoint
│   │   ├── content_completion.py   # content-completion endpoint
│   │   ├── regenerate_views.py     # regeneration endpoint
│   │   ├── task_cancel_views.py    # task-cancellation endpoint
│   │   └── similar_plan_recommend.py # similar-plan recommendation endpoint
│   └── test_views.py               # test endpoints
│
├── core/                            # core business layer ✅
│   ├── base/                       # shared base components ✅
│   │   ├── progress_manager.py     # progress manager
│   │   ├── workflow_manager.py     # workflow manager
│   │   ├── sse_manager.py          # SSE push manager
│   │   ├── redis_duplicate_checker.py  # Redis duplicate checker
│   │   ├── task_models.py          # task models
│   │   └── words_detect/           # sensitive-word detection
│   │
│   ├── construction_review/        # construction plan review module ✅
│   │   ├── workflows/              # workflow layer ✅
│   │   │   ├── ai_review_workflow.py       # AI review workflow (core)
│   │   │   ├── document_workflow.py        # document workflow
│   │   │   ├── report_workflow.py          # report workflow
│   │   │   ├── core_functions/             # core business functions 🆕
│   │   │   │   └── ai_review_core_fun.py   # AI review core logic
│   │   │   └── types/                      # state types 🆕
│   │   │       └── __init__.py             # AIReviewState, TaskChainState
│   │   │
│   │   └── component/              # component layer ✅
│   │       ├── ai_review_engine.py          # AI review engine (core)
│   │       ├── document_processor.py        # document processor
│   │       ├── report_generator.py          # report generator
│   │       ├── outline_catalogue_matcher.py # outline/TOC matcher 🆕
│   │       ├── constants.py                 # constant definitions 🆕
│   │       │
│   │       ├── desensitize/                 # data desensitization module 🆕
│   │       │   ├── engine.py                # desensitization engine core
│   │       │   ├── validator.py             # allow/deny-list validation
│   │       │   ├── dict_manager.py          # desensitization dictionary manager
│   │       │   ├── model_client.py          # local LLM desensitization client
│   │       │   ├── remapper.py              # reverse mapping of review results
│   │       │   └── processors/              # four-dimension processors
│   │       │       ├── base_processor.py    # base processor
│   │       │       ├── pii_processor.py     # personal info (names/phones/ID numbers)
│   │       │       ├── geo_processor.py     # geo info (chainage/location/elevation)
│   │       │       ├── biz_processor.py     # business info (company/project names)
│   │       │       └── financial_processor.py # financial info (amounts/prices)
│   │       │
│   │       ├── doc_worker/                  # document worker
│   │       │   ├── interfaces.py            # data interfaces/contracts
│   │       │   ├── classification/          # content classification 🆕
│   │       │   │   ├── chunk_classifier.py  # chunk classifier
│   │       │   │   ├── hierarchy_classifier.py # hierarchy classifier
│   │       │   │   └── smart_local_classifier.py # smart local classification
│   │       │   ├── pdf_worker/              # PDF processing 🆕
│   │       │   │   └── html_to_markdown.py  # HTML-to-Markdown conversion
│   │       │   ├── config/                  # configuration
│   │       │   │   ├── StandardCategoryTable.csv # classification standard table
│   │       │   │   ├── prompt.yaml          # LLM prompts
│   │       │   │   └── config.yaml
│   │       │   ├── models/                  # data models
│   │       │   │   ├── document_structure.py # document structure model
│   │       │   │   └── converters.py        # model converters
│   │       │   └── utils/
│   │       │       ├── text_split_support.py # text-splitting support
│   │       │       └── prompt_loader.py     # prompt loading
│   │       │
│   │       ├── minimal_pipeline/            # minimal processing pipeline 🆕
│   │       │   ├── simple_processor.py      # simple processor
│   │       │   ├── ocr_processor.py         # OCR processing (decoupled module)
│   │       │   ├── pdf_extractor.py         # PDF extraction
│   │       │   ├── chunk_assembler.py       # chunk assembler
│   │       │   ├── catalog_reviewer.py      # catalog review
│   │       │   ├── toc_builder.py           # TOC builder
│   │       │   └── toc_detector.py          # TOC detector
│   │       │
│   │       ├── standard_matching/           # standard matching module 🆕
│   │       │   ├── standard_dao.py          # standard data access
│   │       │   └── standard_service.py      # standard matching service
│   │       │
│   │       ├── infrastructure/              # infrastructure components
│   │       │   ├── milvus.py                # Milvus vector-store client
│   │       │   ├── parent_tool.py           # parent-chunk tooling
│   │       │   └── relevance.py             # relevance scoring
│   │       │
│   │       ├── check_completeness/          # completeness check
│   │       │   └── components/
│   │       │       └── result_analyzer.py   # result analyzer
│   │       │
│   │       ├── report/                      # report generation
│   │       │   └── prompt/
│   │       │       └── report_reviewers.yaml
│   │       │
│   │       └── reviewers/                   # reviewer collection ✅
│   │           ├── base_reviewer.py         # base reviewer
│   │           ├── completeness_reviewer.py # completeness reviewer
│   │           ├── reference_basis_reviewer.py # reference-basis reviewer
│   │           ├── standard_timeliness_reviewer.py # standard-timeliness reviewer 🆕
│   │           ├── timeliness_basis_reviewer.py  # timeliness-basis reviewer
│   │           ├── timeliness_content_reviewer.py # timeliness-content reviewer 🆕
│   │           ├── semantic_logic.py        # semantic-logic reviewer
│   │           ├── sensitive_word_check.py  # sensitive-word check 🆕
│   │           ├── check_completeness/
│   │           │   └── components/
│   │           │       └── result_analyzer.py
│   │           ├── prompt/                  # reviewer prompts
│   │           │   ├── ai_suggestion.yaml
│   │           │   ├── basic_reviewers.yaml
│   │           │   ├── outline_reviewers.yaml
│   │           │   ├── query_extract.yaml
│   │           │   ├── rag_reviewers.yaml
│   │           │   ├── reference_basis_reviewer.yaml
│   │           │   ├── technical_reviewers.yaml
│   │           │   └── timeliness_basis_reviewer.yaml
│   │           ├── sensitive_words/         # sensitive-word dictionaries
│   │           └── utils/                   # reviewer utilities
│   │               ├── ac_automaton.py      # Aho-Corasick automaton
│   │               ├── directory_extraction.py
│   │               ├── llm_content_classifier_v2/ # LLM content classifier v2 🆕
│   │               │   ├── main_classifier.py    # main entry point
│   │               │   ├── content_classifier.py # core classification logic
│   │               │   ├── category_loaders.py   # category loaders
│   │               │   ├── chunks_converter.py   # chunk conversion
│   │               │   ├── models.py             # data models
│   │               │   ├── prompt.py             # classification prompts
│   │               │   └── embedding_client.py   # embedding client
│   │               └── llm_chain_client/          # LLM chain-client framework 🆕
│   │                   ├── bootstrap.py           # client factory
│   │                   ├── interfaces/            # interface definitions
│   │                   │   ├── chain_executor.py
│   │                   │   ├── llm_client.py
│   │                   │   └── prompt_loader.py
│   │                   ├── implementations/       # implementations
│   │                   │   ├── chains/async_chain_executor.py
│   │                   │   ├── clients/ (base, deepseek, doubao, gemini, qwen)
│   │                   │   └── loaders/yaml_prompt_loader.py
│   │                   └── orchestration/prompt_chain_processor.py
│   │
│   └── construction_write/          # construction plan writing module ✅ 🆕
│       ├── component/
│       │   ├── outline_generator.py # outline generator
│       │   ├── state_models.py      # state models
│       │   └── prompt/
│       │       └── keyword_rules_3.json # keyword rules
│       └── workflows/
│           ├── agent.py             # writing agent
│           └── outline_workflow.py  # outline workflow
│
├── foundation/                      # foundation layer ✅
│   ├── ai/                         # AI module ✅
│   │   ├── agent/                  # AI agents
│   │   │   └── generate/           # generation module
│   │   │       └── model_generate.py
│   │   ├── models/                 # model management ✅
│   │   │   ├── model_handler.py    # multi-model manager
│   │   │   ├── model_config_loader.py # model config loader 🆕
│   │   │   └── rerank_model.py     # rerank model
│   │   └── rag/                    # retrieval-augmented generation ✅
│   │       └── retrieval/          # retrieval module
│   │           ├── query_rewrite.py         # query rewriting
│   │           ├── retrieval.py             # retrieval manager
│   │           └── entities_enhance.py      # entity-enhanced retrieval
│   │
│   ├── database/                   # database module ✅
│   │   ├── base/                   # database base layer
│   │   │   ├── kg/                 # knowledge graph (Neo4j)
│   │   │   ├── sql/                # SQL databases (MySQL/PostgreSQL)
│   │   │   └── vector/             # vector databases
│   │   ├── models/                 # data model definitions ✅
│   │   ├── repositories/           # data access layer ✅
│   │   └── migrations/             # database migrations
│   │
│   ├── infrastructure/             # infrastructure ✅
│   │   ├── cache/                  # cache management (Redis)
│   │   ├── config/                 # configuration management ✅
│   │   │   └── config.py           # config_handler
│   │   ├── messaging/              # message queue (Celery) ✅
│   │   ├── mysql/                  # MySQL connection pool ✅
│   │   └── tracing/                # distributed tracing ✅
│   │
│   ├── observability/              # observability ✅
│   │   ├── logger/                 # logging
│   │   ├── metrics/                # metrics collection
│   │   ├── monitoring/             # monitoring ✅
│   │   │   ├── ai_trace_monitor.py # AI trace monitoring
│   │   │   ├── time_statistics.py  # timing statistics
│   │   │   └── rag/                # RAG monitoring 🆕
│   │   └── cachefiles/             # cache-file management 🆕
│   │
│   ├── schemas/                    # data schemas ✅
│   └── utils/                      # utilities ✅
│       ├── common.py
│       ├── redis_utils.py
│       ├── yaml_utils.py
│       ├── tool_utils.py
│       └── md5.py                  # MD5 helper 🆕
│
├── data_pipeline/                  # data pipeline layer ✅
│   ├── document/                   # document processing
│   │   ├── parsers/               # document parsers
│   │   └── processors/            # data processors
│   ├── RAG_recall/                # RAG retrieval ✅
│   │   └── rag_miluvs/            # Milvus RAG implementation
│   ├── milvus_inbound_script/     # Milvus ingestion scripts
│   └── training_data/             # training-data processing
│
├── config/                         # configuration files ✅
│   ├── config.ini                 # main config (models/databases/Redis)
│   ├── config.ini.template        # config template
│   ├── model_setting.yaml         # model settings 🆕
│   ├── prompt/                    # prompt configuration ✅
│   │   ├── system_prompt.yaml
│   │   └── intent_prompt.yaml
│   ├── sql/                       # SQL scripts
│   └── yolo/                      # YOLO model 🆕
│       └── best.pt
│
├── utils_test/                    # test tooling
│   ├── AI_Review_Test/           # AI review tests
│   ├── API_key/                  # API key generation 🆕
│   ├── Check_Item/               # review-item tests 🆕
│   ├── Chunk_Split_Test/         # chunk-splitting tests 🆕
│   ├── Completeness_Enhanced_Test/ # enhanced completeness tests 🆕
│   ├── Completeness_Test/         # completeness tests 🆕
│   ├── Integration_Test/         # integration tests
│   ├── Milvus_Test/              # Milvus tests
│   ├── MinerU_Test/              # MinerU tests
│   ├── Model_Test/               # model tests
│   ├── Other_Test/               # other tests 🆕
│   ├── Prompt_Test/              # prompt tests 🆕
│   ├── RAG_Test/                 # RAG tests 🆕
│   ├── RE_Rrank_Test/            # rerank tests 🆕
│   ├── Redis/                    # Redis sentinel tests 🆕
│   ├── Redis_Test/               # Redis tests
│   ├── Result_Visual_Observation_Tools/ # result visualization 🆕
│   ├── Semantic_Logic_Test/      # semantic-logic tests
│   ├── Sensitive_Test/           # sensitive-word tests
│   ├── standard_new_Test/        # new-standard tests 🆕
│   └── Sync_Funcation_Test/      # sync-function tests
│
├── docker/                        # Docker configuration
├── .design/                       # design documents
│   ├── 施工方案编写应用/           # writing-app design 🆕
│   └── 施工方案审查应用/           # review-app design
│       ├── base_stage/           # base-stage design
│       ├── Iterative_stage/      # iterative-stage design
│       ├── technical_rehearsal/  # technical rehearsal
│       ├── 基础架构设计/          # infrastructure design
│       ├── 施工方案审查API架构设计/ # API architecture design
│       ├── 完整性审查模块/        # completeness-review design
│       └── 文档处理模块/          # document-processing design
│
├── .RaD/                          # R&D documents 🆕
│
├── logs/                          # log directory
├── README.md                      # project readme
├── README_deploy.md               # deployment guide 🆕
├── README_test.md                 # testing guide 🆕
├── requirements.txt               # dependency list
├── gunicorn_config.py            # Gunicorn configuration
├── run.sh                        # startup script
└── Dockerfile                    # Docker image
```
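The `desensitize/` module above pairs dimension-specific processors with a `remapper` so that masked text can be sent to a model and the review output mapped back onto the original. A minimal sketch of that round trip for one PII pattern (phone numbers); the regex, placeholder format, and function names are illustrative assumptions, not the project's actual API:

```python
import re
from typing import Dict, Tuple

def desensitize_phones(text: str) -> Tuple[str, Dict[str, str]]:
    """Replace mainland-style mobile numbers with numbered placeholders."""
    mapping: Dict[str, str] = {}

    def repl(match: re.Match) -> str:
        placeholder = f"[PHONE_{len(mapping) + 1}]"
        mapping[placeholder] = match.group(0)  # remember original for remapping
        return placeholder

    return re.sub(r"1[3-9]\d{9}", repl, text), mapping

def remap(text: str, mapping: Dict[str, str]) -> str:
    """Reverse step after review: restore originals (remapper.py's role)."""
    for placeholder, original in mapping.items():
        text = text.replace(placeholder, original)
    return text

masked, mapping = desensitize_phones("联系人电话 13812345678")
print(masked)                  # -> 联系人电话 [PHONE_1]
print(remap(masked, mapping))  # -> 联系人电话 13812345678
```

The real module reportedly also covers geo, business, and financial dimensions, each presumably contributing to one shared placeholder mapping.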

---

## Technology Stack

### Web Framework
- **FastAPI** - modern async web framework
- **Uvicorn** - ASGI server
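The server layer splits route registration (`RouteManager`) from app construction (`ApplicationFactory`). A framework-agnostic sketch of that split; the class and method names mirror the diagram but the bodies are illustrative, not the project's actual code:

```python
from dataclasses import dataclass, field
from typing import Callable, Dict, Tuple

Handler = Callable[[dict], dict]

@dataclass
class RouteManager:
    """Collects (method, path) -> handler mappings before app creation."""
    routes: Dict[Tuple[str, str], Handler] = field(default_factory=dict)

    def add(self, method: str, path: str, handler: Handler) -> None:
        self.routes[(method.upper(), path)] = handler

class ApplicationFactory:
    """Builds an app from a RouteManager (stand-in for FastAPI wiring)."""
    def __init__(self, route_manager: RouteManager):
        self.route_manager = route_manager

    def create_app(self) -> Callable[[str, str, dict], dict]:
        routes = dict(self.route_manager.routes)  # freeze at creation time

        def app(method: str, path: str, payload: dict) -> dict:
            handler = routes.get((method.upper(), path))
            return {"status": 404} if handler is None else handler(payload)

        return app

rm = RouteManager()
rm.add("POST", "/construction/upload", lambda p: {"status": 200, "file": p["name"]})
app = ApplicationFactory(rm).create_app()
print(app("POST", "/construction/upload", {"name": "plan.pdf"}))
```

In the real application the factory would also attach lifespan hooks (sensitive-word init, DB pools) as noted for `views/__init__.py`.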

### AI Frameworks
- **LangChain** - LLM application development framework
- **LangGraph** - workflow orchestration engine
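Workflows such as `ai_review_workflow.py` presumably compose review steps over a typed state (the tree lists `AIReviewState`). A dependency-free sketch of that pattern, with a fixed node order standing in for LangGraph's edge resolution; the node functions and state fields are illustrative:

```python
from typing import Callable, List, TypedDict

class AIReviewState(TypedDict, total=False):
    chunks: List[str]     # document chunks under review
    findings: List[str]   # issues accumulated by reviewer nodes
    report: str           # final summary

Node = Callable[[AIReviewState], AIReviewState]

def run_workflow(state: AIReviewState, nodes: List[Node]) -> AIReviewState:
    for node in nodes:  # LangGraph would follow graph edges; here order is fixed
        state = node(state)
    return state

def completeness_node(state: AIReviewState) -> AIReviewState:
    missing = [] if "safety" in " ".join(state["chunks"]) else ["safety section missing"]
    return {**state, "findings": state.get("findings", []) + missing}

def report_node(state: AIReviewState) -> AIReviewState:
    return {**state, "report": "; ".join(state["findings"]) or "no issues"}

result = run_workflow({"chunks": ["scope", "schedule"]},
                      [completeness_node, report_node])
print(result["report"])  # -> safety section missing
```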

### Databases
- **MySQL** - relational database (primary store)
- **Milvus** - vector database (RAG retrieval)
- **PostgreSQL** - alternative relational database (base components retained, not yet in use)
- **Neo4j** - knowledge-graph database (base components retained, not yet in use)

### Cache / Message Queue
- **Redis** - cache + distributed locks
- **Celery** - async task queue
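The `redis_duplicate_checker.py` component in `core/base` most likely leans on Redis `SET key value NX EX ttl` semantics: the first submitter of a task key wins until the key expires. A sketch with an in-memory dict standing in for Redis; the TTL and key format are assumptions:

```python
import time

class DuplicateChecker:
    """Mimics Redis `SET ... NX EX ttl`: first caller wins until expiry."""
    def __init__(self, ttl_seconds: float = 60.0):
        self.ttl = ttl_seconds
        self._store = {}  # key -> expiry timestamp

    def try_acquire(self, task_key: str, now: float = None) -> bool:
        now = time.monotonic() if now is None else now
        expiry = self._store.get(task_key)
        if expiry is not None and expiry > now:
            return False  # duplicate submission within the TTL window
        self._store[task_key] = now + self.ttl
        return True

checker = DuplicateChecker(ttl_seconds=60)
print(checker.try_acquire("review:doc-md5-abc"))  # -> True  (first submission)
print(checker.try_acquire("review:doc-md5-abc"))  # -> False (duplicate)
```

The `md5.py` utility in `foundation/utils` would plausibly supply the document hash used in such keys.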

### Supported Models
#### Local models
- **lq_qwen3_8b** - Qwen3-8B (default model)
- **lq_qwen3_4b** - Qwen3-4B
- **lq_qwen3_8b_emd** - Qwen3-8B embedding
- **lq_rerank_model** - Qwen3-Reranker-8B

#### Cloud models
- **Doubao** - ByteDance models
- **Qwen (Tongyi Qianwen)** - Alibaba Cloud models
- **DeepSeek** - DeepSeek models
- **Gemini** - Google models
- **SiliconFlow** - SiliconFlow-hosted models
+
+### OCR服务
+- **MinerU** - 文档OCR识别 (集成中)
+
+---
+
+## API接口清单
+
+### 施工方案审查API
+
+#### 文档上传
+- `POST /construction/upload` - 上传待审查文档
+
+#### 审查控制
+- `POST /sgsc/sse/launch_review` - 启动审查任务(SSE流式)
+- `GET /construction/review/results` - 查询审查结果
+
+#### 任务控制
+- `POST /construction/task/terminate` - 终止审查任务
+
+#### 数据脱敏
+- 脱敏API接口 (desensitize_api.py)
+
+### 施工方案编写API 🆕
+
+#### 大纲生成
+- 大纲生成与工作流接口 (outline_views.py)
+
+#### 内容编写
+- 内容补全接口 (content_completion.py)
+- 重新生成接口 (regenerate_views.py)
+- 任务取消接口 (task_cancel_views.py)
+
+#### 方案推荐
+- 相似方案推荐接口 (similar_plan_recommend.py)
+
+---
+
+## 数据流向图
+
+```
+┌──────────────┐
+│  用户上传文档  │
+└──────┬───────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Views Layer (file_upload.py)                                 │
+│  • 接收文件                                                    │
+│  • 文件验证                                                    │
+│  • 创建任务                                                    │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Core Layer - Document Workflow                              │
+│  • 文档解析 (PDF/DOCX)                                        │
+│  • 结构提取                                                   │
+│  • 分块处理                                                   │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Database Layer                                              │
+│  • Redis - 存储文档元数据                                      │
+│  • Milvus - 向量存储                                          │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Core Layer - AI Review Workflow                             │
+│  • AI审查引擎 (construction_review/ai_review_engine.py)     │
+│  • 多种审查器并行执行                                          │
+│  • RAG检索增强                                                │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Foundation Layer - AI模块                                    │
+│  • 模型调用 (model_handler.py)                               │
+│  • RAG检索 (retrieval/)                                      │
+│  • 查询重写                                                   │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  External Services                                           │
+│  • AI模型 (本地/云端)                                         │
+│  • Milvus向量检索                                            │
+│  • Redis缓存                                                 │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────────────────────────────────────────────────┐
+│  结果处理                                                     │
+│  • 报告生成 (report_generator.py)                            │
+│  • SSE推送进度                                                │
+│  • 结果存储                                                   │
+└──────┬───────────────────────────────────────────────────────┘
+       ▼
+┌──────────────────┐
+│  返回审查结果     │
+└──────────────────┘
+```
+
+---
+
+## 各层职责说明
+
+### 1. Server Layer (服务器层)
+**职责**: FastAPI应用管理、路由配置、Celery任务管理
+**实现**:
+- `RouteManager` - 路由配置和中间件
+- `CeleryWorkerManager` - Celery Worker生命周期管理
+- `ApplicationFactory` - 应用工厂模式
+
+### 2. Views Layer (视图层)
+**职责**: HTTP API接口、请求处理、响应格式化
+**实现**:
+- **construction_review/** - 审查API (上传、审查启动、结果查询、任务控制、脱敏)
+- **construction_write/** 🆕 - 编写API (大纲生成、内容补全、重新生成、方案推荐)
+
+### 3. Core Layer (核心业务层)
+**职责**: 业务逻辑编排、应用状态管理、工作流协调
+**实现**:
+- **construction_review/** - 施工方案审查模块
+  - **workflows/** - 基于LangGraph的工作流编排
+  - **workflows/core_functions/** 🆕 - AI审查核心业务函数
+  - **workflows/types/** 🆕 - 状态类型定义 (AIReviewState, TaskChainState)
+  - **component/** - 可复用的业务组件
+  - **component/desensitize/** 🆕 - 四维度数据脱敏 (PII/地理/业务/财务)
+  - **component/minimal_pipeline/** 🆕 - 简化处理管道 (PDF提取/OCR/目录)
+  - **component/standard_matching/** 🆕 - 标准规范匹配服务
+  - **component/doc_worker/classification/** 🆕 - 智能内容分类 (分块/层级/本地)
+  - **component/reviewers/** - 专门化的审查器集合
+  - **component/reviewers/utils/llm_content_classifier_v2/** 🆕 - LLM驱动的内容分类器
+  - **component/reviewers/utils/llm_chain_client/** 🆕 - 多厂商LLM链式调用框架
+- **construction_write/** 🆕 - 施工方案编写模块
+  - **component/** - 大纲生成器、状态模型
+  - **workflows/** - 编写智能体、大纲工作流
+
+### 4. Foundation Layer (基础设施层-办公应用智能体可复用的基础组件)
+**职责**: 基础设施、通用组件、外部服务集成
+**实现**:
+- **ai/** - AI模型管理、RAG检索
+- **database/** - 多数据库支持
+- **infrastructure/** - 缓存、配置、消息队列
+- **observability/** - 日志、监控、指标
+
+### 5. Data Pipeline Layer (离线数据管道层,不与在线流程集成)
+**职责**: 数据处理、格式转换、内容解析
+**实现**: 文档解析器、数据处理器、向量入库
+
+---
+
+## 配置管理结构
+
+### config.ini 主要配置项
+
+```ini
+; 见源代码中的 config/config.ini
+```
+
+---
+
+## 系统启动流程详解
+
+### 1. 应用启动入口 (server/app.py)
+
+#### 启动类架构
+```
+ApplicationFactory (应用工厂)
+├── create_app()              # 创建FastAPI应用实例
+├── create_server_config()    # 创建服务器配置
+└── celery_manager           # CeleryWorkerManager实例
+
+RouteManager (路由管理器)
+├── _setup_cors()            # 配置CORS中间件
+├── _setup_routes()          # 配置所有路由
+├── _setup_exception_handlers() # 全局异常处理
+├── _setup_health_checks()   # 健康检查接口
+└── _setup_api_docs()        # Swagger API文档
+
+CeleryWorkerManager (Celery Worker管理器)
+├── start_worker()           # 启动Celery Worker(后台线程)
+├── stop_worker()            # 优雅停止Worker
+├── stop_worker_immediately() # 立即停止Worker
+└── _cleanup_redis_tasks()   # 清理Redis任务
+
+ServerRunner (服务器运行器)
+└── run_server()             # 运行Uvicorn服务器
+```
+
+#### 启动流程
+```
+1. python server/app.py (主入口)
+   ↓
+2. 创建ApplicationFactory实例
+   ↓
+3. 初始化RouteManager配置路由
+   - 添加CORS中间件
+   - 注册所有API路由
+   - 配置全局异常处理
+   - 设置健康检查接口
+   ↓
+4. 启动CeleryWorkerManager(后台线程)
+   - 清理Redis残留任务
+   - 在独立线程中运行celery_app.worker_main(['worker'])
+   - 等待2秒确保启动成功
+   ↓
+5. 配置信号处理器
+   - SIGINT (Ctrl+C)
+   - SIGTERM (终止信号)
+   - Windows控制台事件(CTRL_CLOSE_EVENT等)
+   ↓
+6. 启动Uvicorn服务器
+   - host: 0.0.0.0 (可配置)
+   - port: 8002 (可配置)
+   - 加载FastAPI应用
+   ↓
+7. 服务运行中...
+   ↓
+8. 收到停止信号时
+   - 停止Celery Worker
+   - 清理Redis任务
+   - 关闭事件循环
+```
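上述"后台线程启动 Worker + 信号触发优雅停止"的模式可以用纯标准库勾勒出来。以下是示意草图:类名、等待时长均为简化假设,真实实现中线程体是 `celery_app.worker_main(['worker'])`,以源码为准。

```python
import threading
import time

class BackgroundWorkerManager:
    """示意 CeleryWorkerManager 的后台线程模式(简化假设,不依赖真实 Celery)。"""

    def __init__(self):
        self._stop_event = threading.Event()
        self._thread = None
        self.processed = 0

    def _worker_loop(self):
        # 真实实现中此处运行 celery_app.worker_main(['worker'])
        while not self._stop_event.is_set():
            self.processed += 1
            time.sleep(0.01)

    def start_worker(self):
        # 对应"启动Celery Worker(后台线程)+ 等待确保启动成功"
        self._thread = threading.Thread(target=self._worker_loop, daemon=True)
        self._thread.start()
        time.sleep(0.05)

    def stop_worker(self):
        # 优雅停止:置位停止信号并等待线程退出
        self._stop_event.set()
        if self._thread:
            self._thread.join(timeout=1)
```

这种"Event 置位 + join 等待"的写法保证收到 SIGINT/SIGTERM 后 Worker 能在当前循环结束处退出,而不是被强杀。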
+
+#### 关键配置项
+```ini
+[launch]
+HOST = 0.0.0.0              # 监听地址
+LAUNCH_PORT = 8002          # 监听端口
+
+[redis]
+REDIS_HOST=127.0.0.1        # Redis主机
+REDIS_PORT=6379             # Redis端口
+REDIS_DB=0                  # Redis数据库
+REDIS_PASSWORD=123456       # Redis密码
+```
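上述配置项可以用标准库 `configparser` 读取。以下为示意片段,字段名取自上文示例,实际的配置加载逻辑以源码为准:

```python
import configparser
from io import StringIO

# 用内嵌字符串模拟 config/config.ini 的相关片段
SAMPLE = """
[launch]
HOST = 0.0.0.0
LAUNCH_PORT = 8002

[redis]
REDIS_HOST=127.0.0.1
REDIS_PORT=6379
REDIS_DB=0
"""

parser = configparser.ConfigParser()
parser.read_file(StringIO(SAMPLE))

# 选项名大小写不敏感,节名大小写敏感
host = parser.get("launch", "HOST")
port = parser.getint("launch", "LAUNCH_PORT")
redis_port = parser.getint("redis", "REDIS_PORT")
```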
+
+---
+
+## 任务提交流程详解
+
+### 1. 完整任务流程架构
+
+```
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│                           任务提交与执行完整流程                                   │
+└─────────────────────────────────────────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ 步骤1: 文件上传 (views/construction_review/file_upload.py)                      │
+│  POST /construction/upload                                                      │
+│  • 接收PDF文件上传                                                               │
+│  • 生成file_id和callback_task_id                                                │
+│  • 保存文件内容到Redis                                                           │
+│  • 返回callback_task_id给前端                                                    │
+└─────────────────────────────────────────────────────────────────────────────────┘
+                                    ↓
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ 步骤2: 启动审查 (views/construction_review/launch_review.py)                    │
+│  POST /sgsc/sse/launch_review                                                   │
+│  • 验证用户权限和参数                                                            │
+│  • 建立SSE连接                                                                   │
+│  • 调用WorkflowManager.submit_task_processing()                                  │
+└─────────────────────────────────────────────────────────────────────────────────┘
+                                    ↓
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ 步骤3: 提交到Celery (core/base/workflow_manager.py)                             │
+│  WorkflowManager.submit_task_processing()                                       │
+│  • 使用CeleryTraceManager提交任务                                               │
+│  • 自动传递trace_id用于链路追踪                                                  │
+│  • 任务进入Redis队列                                                             │
+└─────────────────────────────────────────────────────────────────────────────────┘
+                                    ↓
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ 步骤4: Celery Worker执行 (foundation/infrastructure/messaging/tasks.py)        │
+│  submit_task_processing_task()                                                  │
+│  • 从队列获取任务                                                                │
+│  • 恢复trace_id上下文                                                            │
+│  • 调用WorkflowManager.submit_task_processing_sync()                             │
+└─────────────────────────────────────────────────────────────────────────────────┘
+                                    ↓
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│ 步骤5: LangGraph任务链执行 (core/base/workflow_manager.py)                      │
+│  submit_task_processing_sync()                                                  │
+│  • 创建TaskFileInfo对象                                                          │
+│  • 创建TaskChainState初始状态                                                    │
+│  • 构建LangGraph任务链工作流图                                                   │
+│  • 执行ainvoke()运行工作流                                                       │
+└─────────────────────────────────────────────────────────────────────────────────┘
+                                    ↓
+┌─────────────────────────────────────────────────────────────────────────────────┐
+│                    LangGraph任务链工作流(方案D)                                │
+│  ┌─────────────────────────────────────────────────────────────────────────┐   │
+│  │ start → document_processing → ai_review_subgraph → report_generation →  │   │
+│  │        complete                                                          │   │
+│  └─────────────────────────────────────────────────────────────────────────┘   │
+│           每个阶段后检查终止信号:                                              │
+│           - terminate: 进入终止节点                                            │
+│           - error: 进入错误处理节点                                            │
+│           - continue: 继续下一阶段                                             │
+└─────────────────────────────────────────────────────────────────────────────────┘
+```
+
+### 2. LangGraph任务链节点详解
+
+#### 节点1: start (任务链开始)
+```python
+async def _start_chain_node(state: TaskChainState) -> TaskChainState:
+    """
+    初始化任务链状态
+    - current_stage: "start"
+    - overall_task_status: "processing"
+    - stage_status: 所有阶段初始化为"pending"
+    """
+```
+
+#### 节点2: document_processing (文档处理)
+```python
+async def _document_processing_node(state: TaskChainState) -> TaskChainState:
+    """
+    执行文档处理工作流
+    - 检查终止信号
+    - 创建DocumentWorkflow实例
+    - 调用document_workflow.execute()
+    - 返回结构化内容
+    - stage_status["document"]: "completed"/"terminated"/"failed"
+    """
+```
+
+#### 节点3: ai_review_subgraph (AI审查)
+```python
+async def _ai_review_subgraph_node(state: TaskChainState) -> TaskChainState:
+    """
+    执行AI审查工作流(嵌套子图)
+    - 检查终止信号
+    - 获取文档处理结果中的structured_content
+    - 创建AIReviewWorkflow实例
+    - 调用ai_workflow.execute()
+    - 返回审查结果
+    - stage_status["ai_review"]: "completed"/"terminated"/"failed"
+    """
+```
+
+#### 节点4: report_generation (报告生成)
+```python
+async def _report_generation_node(state: TaskChainState) -> TaskChainState:
+    """
+    生成审查报告
+    - 检查终止信号
+    - 获取AI审查结果
+    - 创建ReportWorkflow实例
+    - 调用report_workflow.execute()
+    - 保存完整结果到文件
+    - stage_status["report"]: "completed"/"terminated"/"failed"
+    """
+```
+
+#### 节点5: complete (任务完成)
+```python
+async def _complete_chain_node(state: TaskChainState) -> TaskChainState:
+    """
+    标记整体任务完成
+    - overall_task_status: "completed" ⚠️ 只有到这里才标记完成
+    - 清理Redis文件缓存
+    - 通知SSE连接
+    """
+```
+
+#### 节点6: error_handler (错误处理)
+```python
+async def _error_handler_chain_node(state: TaskChainState) -> TaskChainState:
+    """
+    处理任务链错误
+    - overall_task_status: "failed"
+    - 清理Redis文件缓存
+    - 通知SSE连接失败状态
+    """
+```
+
+#### 节点7: terminate (任务终止)
+```python
+async def _terminate_chain_node(state: TaskChainState) -> TaskChainState:
+    """
+    处理任务终止
+    - overall_task_status: "terminated"
+    - 清理Redis终止信号
+    - 清理Redis文件缓存
+    - 通知SSE连接终止状态
+    """
+```
+
+### 3. 条件边判断逻辑
+
+```python
+def _should_terminate_or_error_chain(state: TaskChainState) -> str:
+    """
+    决定工作流下一步走向
+    优先级:terminate > error > continue
+    """
+    # 1. 优先检查终止信号
+    if state.get("overall_task_status") == "terminated":
+        return "terminate"
+
+    # 2. 检查错误状态
+    if state.get("overall_task_status") == "failed" or state.get("error_message"):
+        return "error"
+
+    # 3. 默认继续执行
+    return "continue"
+```
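条件边与任务链节点的配合关系可以用一个极简的"路由表"演示:节点名与上文任务链一致,`EDGES` 字典是示意结构,真实实现通过 LangGraph 的 `add_conditional_edges` 完成同样的接线。

```python
def should_terminate_or_error_chain(state: dict) -> str:
    """条件边:优先级 terminate > error > continue(与上文逻辑一致)。"""
    if state.get("overall_task_status") == "terminated":
        return "terminate"
    if state.get("overall_task_status") == "failed" or state.get("error_message"):
        return "error"
    return "continue"

# 示意路由表:每个阶段节点在三种判定结果下的去向
EDGES = {
    "document_processing": {"continue": "ai_review_subgraph",
                            "error": "error_handler", "terminate": "terminate"},
    "ai_review_subgraph": {"continue": "report_generation",
                           "error": "error_handler", "terminate": "terminate"},
    "report_generation": {"continue": "complete",
                          "error": "error_handler", "terminate": "terminate"},
}

def next_node(current: str, state: dict) -> str:
    """根据当前节点与状态,决定工作流的下一个节点。"""
    return EDGES[current][should_terminate_or_error_chain(state)]
```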
+
+---
+
+## 审查逻辑详解
+
+### 1. AI审查引擎架构 (core/construction_review/component/construction_review/ai_review_engine.py)
+
+```
+AIReviewEngine (审查引擎核心)
+├── 基础合规性检查
+│   ├── check_grammar()              # 词句语法检查
+│   ├── check_semantic_logic()       # 语义逻辑检查
+│   ├── check_completeness()         # 完整性检查
+│   ├── check_timeliness()           # 时效性检查
+│   └── check_reference()            # 规范性检查
+│
+├── 技术性合规检查
+│   ├── check_non_parameter_compliance()  # 非参数合规性检查
+│   └── check_parameter_compliance()      # 参数合规性检查
+│
+├── RAG增强审查
+│   ├── vector_search()              # 向量检索
+│   ├── hybrid_search()              # 混合检索
+│   ├── rerank_results()             # 重排序结果
+│   └── generate_enhanced_suggestions() # 生成增强建议
+│
+└── 专业性审查
+    ├── prep_basis_review()          # 编制依据审查
+    ├── outline_review_results_df()  # 目录审查
+    └── catalogues_check()           # 章节审查
+```
+
+### 2. 审查配置与模式
+
+#### 审查配置参数
+```python
+# 方式1: review_config (审查维度枚举值)
+review_config = [
+    'sensitive_word_check',       # 词句语法检查
+    'semantic_logic_check',       # 语义逻辑审查
+    'completeness_check',         # 条文完整性审查
+    'timeliness_check',           # 时效性审查
+    'reference_check',            # 规范性审查
+    'sensitive_check',            # 敏感词审查
+    'non_parameter_compliance_check',  # 非参数合规性检查
+    'parameter_compliance_check', # 参数合规性检查
+]
+
+# 方式2: review_item_config (章节_审查维度格式)
+review_item_config = [
+    'basis_sensitive_word_check',     # 编制依据章节-词句语法检查
+    'plan_semantic_logic_check',      # 施工计划章节-语义逻辑审查
+    'catalogue_completeness_check',   # 目录章节-完整性检查(特殊规则)
+]
+```
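"章节_审查维度"格式的配置项可以按首个下划线拆成章节前缀与审查类型。以下是假设性示例:这里假定章节前缀(如 `basis`、`plan`、`catalogue`)不含下划线,真实的枚举与解析逻辑以源码为准。

```python
def parse_review_item(item: str) -> tuple:
    """将"章节_审查维度"拆为 (章节前缀, 审查类型)。示意实现。"""
    chapter, check_type = item.split("_", 1)  # 只在第一个下划线处拆分
    return chapter, check_type
```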
+
+#### 审查模式配置
+```ini
+[ai_review]
+MAX_REVIEW_UNITS=5          # 最大审查单元数量(0=全部审查)
+REVIEW_MODE=all             # 审查模式: all/random/first
+```
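`MAX_REVIEW_UNITS` 与 `REVIEW_MODE` 对审查单元的选取可以勾勒成如下函数。这是示意实现:`all` 模式忽略数量上限、`0` 表示全部审查均为基于上文注释的假设,实际语义以源码为准。

```python
import random

def select_review_units(units, max_units: int, mode: str, seed: int = 0):
    """按 MAX_REVIEW_UNITS / REVIEW_MODE 选取待审查单元(示意实现)。

    max_units <= 0 表示全部审查;mode 取 all / random / first。
    """
    if max_units <= 0 or mode == "all":
        return list(units)
    if mode == "first":
        return list(units)[:max_units]
    if mode == "random":
        rng = random.Random(seed)  # 固定种子便于复现
        return rng.sample(list(units), min(max_units, len(units)))
    raise ValueError(f"未知审查模式: {mode}")
```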
+
+#### 工程方案类型
+```python
+supported_types = {
+    '01_pf_Found_Rotary_Drill',  # 旋挖钻机、冲击钻机成孔桩
+    '02_pf_Dig_Manual_Pile',     # 人工挖孔桩
+    '03_bd_Sub_Cyl_Pier',        # 圆柱墩、系梁、盖梁
+    # ... 共13种工程方案类型
+}
+```
+
+### 3. AI审查工作流 (core/construction_review/component/workflows/ai_review_workflow.py)
+
+```
+AIReviewWorkflow (基于LangGraph的审查工作流)
+├── 构建审查子图
+│   ├── 添加审查节点(每种审查类型一个节点)
+│   ├── 设置节点间转换关系
+│   └── 编译为可执行图
+│
+├── 审查节点类型
+│   ├── 单元级审查节点
+│   │   ├── 文档分块
+│   │   ├── 并发执行各类审查器
+│   │   ├── 汇总审查结果
+│   │   └── 计算风险等级
+│   │
+│   └── 章节级审查节点
+│       ├── 提取章节内容
+│       ├── 执行章节特定审查
+│       └── 生成章节审查结果
+│
+└── 审查器集合 (reviewers/)
+    ├── base_reviewer.py         # 基础审查器
+    ├── check_completeness/      # 完整性检查组件
+    ├── catalogues_check/        # 目录审查组件
+    └── utils/                   # 审查工具函数
+```
+
+### 4. RAG检索增强流程
+
+```
+RAG检索增强
+│
+├── 1. 查询重写 (query_rewrite_manager)
+│   └── 优化原始查询以提高检索质量
+│
+├── 2. 实体增强 (entity_enhance)
+│   └── 识别文档实体并增强查询
+│
+├── 3. 向量检索 (MilvusManager)
+│   ├── vector_search()         # 纯向量检索
+│   └── hybrid_search()         # 混合检索(向量+关键词)
+│
+├── 4. 父块召回 (enhance_with_parent_docs_grouped)
+│   └── 返回父块上下文信息
+│
+└── 5. 重排序 (rerank_results)
+    └── 使用Rerank模型优化结果排序
+```
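上面五个阶段的串联顺序可以用占位函数示意:每个 stub 只记录调用顺序并透传数据,真实实现分别对应 `query_rewrite_manager`、`entity_enhance`、`MilvusManager`、父块召回与 Rerank 模型,返回值结构为假设。

```python
def run_rag_pipeline(query: str, trace: list) -> list:
    """示意 RAG 检索增强五阶段的调用顺序(占位实现)。"""
    def rewrite(q):          trace.append("rewrite");  return q + " 施工规范"
    def enhance(q):          trace.append("entity");   return q
    def hybrid_search(q):    trace.append("search");   return [{"id": 1, "text": "示例文本", "score": 0.6}]
    def parent_recall(hits): trace.append("parent");   return hits
    def rerank(hits):        trace.append("rerank");   return sorted(hits, key=lambda h: -h["score"])

    # 查询重写 → 实体增强 → 混合检索 → 父块召回 → 重排序
    return rerank(parent_recall(hybrid_search(enhance(rewrite(query)))))
```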
+
+### 5. 审查结果数据结构
+
+```python
+{
+    "callback_task_id": "file_id-timestamp",
+    "file_id": "original_file_id",
+    "file_name": "document.pdf",
+    "user_id": "user-001",
+    "overall_task_status": "completed",  # processing/completed/failed/terminated
+    "stage_status": {
+        "document": "completed",
+        "ai_review": "completed",
+        "report": "completed"
+    },
+    "document_result": {
+        "structured_content": {...},
+        "parsed_sections": [...]
+    },
+    "ai_review_result": {
+        "review_results": [
+            {
+                "unit_index": 0,
+                "unit_content": {...},
+                "review_items": [
+                    {
+                        "check_type": "semantic_logic_check",
+                        "risk_level": "high",
+                        "issues": [...],
+                        "suggestions": [...]
+                    }
+                ]
+            }
+        ],
+        "summary": {...}
+    },
+    "report_result": {
+        "report_path": "/path/to/report.json",
+        "summary": "审查报告摘要"
+    },
+    "timestamp": "2026-02-03T12:00:00"
+}
+```
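上述结果结构可以用 `TypedDict` 整理成类型定义,便于各层传递时做静态检查。以下仅按上文示例字段整理,实际字段以源码为准:

```python
from typing import Any, Dict, TypedDict

class ReviewTaskResult(TypedDict, total=False):
    """审查任务结果结构(按上文示例整理的示意定义)。"""
    callback_task_id: str
    file_id: str
    file_name: str
    user_id: str
    overall_task_status: str      # processing/completed/failed/terminated
    stage_status: Dict[str, str]  # document / ai_review / report 各阶段状态
    document_result: Dict[str, Any]
    ai_review_result: Dict[str, Any]
    report_result: Dict[str, Any]
    timestamp: str
```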
+
+---
+
+## SSE实时推送机制
+
+### SSE连接管理 (core/base/sse_manager.py)
+
+```
+unified_sse_manager (统一SSE管理器)
+├── establish_connection()     # 建立SSE连接并注册回调
+├── send_progress()            # 发送进度更新
+├── close_connection()         # 关闭SSE连接
+└── connection_registry        # 连接注册表 {callback_task_id: queue}
+```
+
+### SSE事件类型
+
+```python
+# 连接事件
+"connected"        # SSE连接已建立
+"connection_closed" # SSE连接已关闭
+
+# 进度事件
+"processing"       # 处理中(通用进度更新)
+"unit_review_update" # 单元审查更新
+"processing_flag"  # 处理标志
+
+# 完成事件
+"submitted"        # 任务已提交
+"completed"        # 任务已完成
+
+# 错误事件
+"error"            # 发生错误
+```
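这些事件最终按 SSE 规范的 `event:`/`data:` 行序列化后推送。以下是 `format_sse_event` 的一种示意实现(函数名取自上文,具体字段与实现以源码为准):

```python
import json

def format_sse_event(event: str, data: dict) -> str:
    """按 SSE 文本协议序列化单个事件:event 行 + data 行 + 空行分隔。"""
    payload = json.dumps(data, ensure_ascii=False)
    return f"event: {event}\ndata: {payload}\n\n"
```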
+
+### 进度推送流程
+
+```
+1. 前端建立SSE连接
+   POST /sgsc/sse/launch_review
+   ↓
+2. 后端建立SSE连接并注册回调
+   unified_sse_manager.establish_connection(callback_task_id, sse_progress_callback)
+   ↓
+3. 返回"connected"事件
+   yield format_sse_event("connected", connected_data)
+   ↓
+4. 提交任务到Celery
+   workflow_manager.submit_task_processing(file_info)
+   ↓
+5. Celery Worker执行任务
+   ↓
+6. 各阶段更新进度
+   progress_manager.update_progress(callback_task_id, stage_data)
+   ↓
+7. SSE回调被触发
+   sse_progress_callback(callback_task_id, current_data)
+   ↓
+8. 发送SSE事件到前端
+   unified_sse_manager.send_progress(callback_task_id, current_data)
+   ↓
+9. 前端接收SSE事件并更新UI
+   ↓
+10. 任务完成,发送"completed"事件
+    yield format_sse_event("completed", completion_data)
+    ↓
+11. 关闭SSE连接
+    unified_sse_manager.close_connection(callback_task_id)
+```
+
+---
+
+## 任务终止机制
+
+### 终止信号设置
+
+```python
+async def set_terminate_signal(callback_task_id: str, operator: str) -> Dict:
+    """
+    设置任务终止信号
+    - 写入Redis: ai_review:terminate_signal:{callback_task_id}
+    - 存储操作人和终止时间
+    - 设置2小时过期时间
+    """
+```
+
+### 终止信号检测
+
+```python
+async def check_terminate_signal(callback_task_id: str) -> bool:
+    """
+    检查是否有终止信号
+    - 从Redis读取终止信号
+    - 每个工作流节点执行前调用
+    - 检测到信号后进入终止流程
+    """
+```
+
+### 终止流程
+
+```
+1. 用户调用终止接口
+   POST /construction/task/terminate
+   ↓
+2. 设置Redis终止信号
+   set_terminate_signal(callback_task_id, operator)
+   ↓
+3. 工作流节点检测到信号
+   check_terminate_signal() returns True
+   ↓
+4. 条件边判断返回"terminate"
+   _should_terminate_or_error_chain() returns "terminate"
+   ↓
+5. 进入terminate节点
+   _terminate_chain_node()
+   ↓
+6. 清理资源
+   - 清理Redis终止信号
+   - 清理Redis文件缓存
+   - 通知SSE连接
+   ↓
+7. 返回终止状态
+   overall_task_status: "terminated"
+```
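终止信号的键名约定与过期语义可以用内存字典模拟如下。这是示意草图:真实实现写入 Redis 并依赖其原生 TTL,这里仅演示 `ai_review:terminate_signal:{callback_task_id}` 的键结构与 2 小时过期判断。

```python
import time

_SIGNALS: dict = {}            # 模拟 Redis 键空间
TERMINATE_TTL = 2 * 60 * 60    # 2小时过期

def set_terminate_signal(callback_task_id: str, operator: str) -> None:
    """写入终止信号:记录操作人、终止时间与过期时刻。"""
    key = f"ai_review:terminate_signal:{callback_task_id}"
    now = time.time()
    _SIGNALS[key] = {"operator": operator,
                     "terminated_at": now,
                     "expire_at": now + TERMINATE_TTL}

def check_terminate_signal(callback_task_id: str) -> bool:
    """检查终止信号是否存在且未过期;每个工作流节点执行前调用。"""
    key = f"ai_review:terminate_signal:{callback_task_id}"
    info = _SIGNALS.get(key)
    if info is None:
        return False
    if time.time() > info["expire_at"]:
        _SIGNALS.pop(key, None)  # 模拟 Redis 键过期被回收
        return False
    return True
```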
+
+---
+
+## 部署架构
+
+### 开发环境
+```
+启动命令: python server/app.py
+
+启动流程:
+1. 加载配置文件 (config/config.ini)
+2. 初始化Redis连接
+3. 启动Celery Worker(后台线程)
+4. 启动Uvicorn服务器(端口8002)
+5. 注册信号处理器(优雅关闭)
+
+依赖服务:
+- Redis (localhost:6379)
+- Milvus (向量数据库)
+- 本地AI模型服务 (192.168.91.253:9002)
+```
+
+### 生产环境
+```
+部署方式: Docker + Docker Compose
+
+启动命令: docker-compose up -d
+
+服务组件:
+- web: FastAPI应用 (Uvicorn)
+- celery: Celery Worker
+- redis: Redis缓存
+- milvus: Milvus向量数据库
+```
+
+### 容器化部署
+```
+docker-compose up -d
+```
+
+---
+
+## v2.0 → v3.0 变更记录 (2026-04-09)
+
+### 新增模块
+
+| 模块 | 路径 | 说明 |
+|------|------|------|
+| 施工方案编写 | `core/construction_write/` | 大纲生成、内容编写工作流 |
+| 编写API | `views/construction_write/` | 大纲/补全/重生成/取消/推荐接口 |
+| 数据脱敏 | `component/desensitize/` | 四维度脱敏(PII/地理/业务/财务)+ 反向映射 |
+| 简化管道 | `component/minimal_pipeline/` | PDF提取、OCR(解耦)、目录构建/检测 |
+| 标准匹配 | `component/standard_matching/` | 标准规范DAO与服务层 |
+| 智能分类 | `doc_worker/classification/` | 分块/层级/本地三种分类策略 |
+| LLM分类器v2 | `reviewers/utils/llm_content_classifier_v2/` | LLM驱动的内容分类,含补充验证机制 |
+| LLM链式客户端 | `reviewers/utils/llm_chain_client/` | 多厂商(DeepSeek/豆包/Gemini/Qwen)链式调用框架 |
+| 核心函数 | `workflows/core_functions/` | AI审查核心业务逻辑抽离 |
+| 状态类型 | `workflows/types/` | AIReviewState、TaskChainState 定义 |
+
+### 新增审查器
+
+| 审查器 | 说明 |
+|--------|------|
+| `standard_timeliness_reviewer.py` | 标准时效性审查 |
+| `timeliness_content_reviewer.py` | 时效性内容审查 |
+| `sensitive_word_check.py` | 敏感词检查 |
+
+### 新增基础设施
+
+| 组件 | 说明 |
+|------|------|
+| `foundation/observability/cachefiles/` | 缓存文件管理 |
+| `foundation/observability/monitoring/rag/` | RAG管道监控 |
+| `foundation/ai/models/model_config_loader.py` | 模型配置加载器 |
+| `foundation/utils/md5.py` | MD5工具 |
+| `config/model_setting.yaml` | 模型设置配置 |
+| `config/yolo/` | YOLO模型权重 |
+
+### 架构变更要点
+
+1. **审查引擎解耦**: `ai_review_engine.py` 核心逻辑部分抽离至 `core_functions/ai_review_core_fun.py`
+2. **OCR模块独立**: `minimal_pipeline/ocr_processor.py` 从PDF提取中解耦为独立模块
+3. **分类体系升级**: 从简单分类升级为 `llm_content_classifier_v2`,支持 keywords 关键字扫描 + LLM补充验证
+4. **多模型支持**: `llm_chain_client` 框架统一 DeepSeek/豆包/Gemini/Qwen 多厂商调用
+5. **脱敏-审查闭环**: `desensitize/` 模块提供正向脱敏 + `remapper.py` 反向映射,保证审查结果可还原
+
+---

+ 3 - 3
config/config.ini.template

@@ -50,11 +50,11 @@ HOST = 0.0.0.0
 LAUNCH_PORT = 8002
 
 [redis]
-REDIS_URL=redis://:123456@127.0.0.1:6379
-REDIS_HOST=127.0.0.1
+REDIS_URL=redis://:Wxcz666@@lqRedis_dev:6379
+REDIS_HOST=lqRedis_dev
 REDIS_PORT=6379
 REDIS_DB=0
-REDIS_PASSWORD=123456
+REDIS_PASSWORD=Wxcz666@
 REDIS_MAX_CONNECTIONS=50
 
 [ocr]

+ 4 - 4
core/construction_review/component/doc_worker/config/StandardCategoryTable.csv

@@ -20,10 +20,10 @@ first_seq,first_code,first_name,second_seq,second_code,second_name,second_focus,
 2,overview,工程概况,5,RequirementsTech,施工要求和技术保证条件,名称类、日期类。名称类、量化单位类、数值类。,2,QualityTarget,质量目标,质量目标、合同条款编号、业主具体要求(如绿色施工认证)。,质量目标;合格率;质量标准;鲁班奖;优质工程;质量等级,
 2,overview,工程概况,5,RequirementsTech,施工要求和技术保证条件,名称类、日期类。名称类、量化单位类、数值类。,3,SecurityGoals,安全目标,安全目标(如零死亡事故、隐患整改率)、合同条款编号、业主具体要求(如绿色施工认证)。,安全目标;零伤亡;安全事故;安全指标;安全生产目标,
 2,overview,工程概况,5,RequirementsTech,施工要求和技术保证条件,名称类、日期类。名称类、量化单位类、数值类。,4,EnvironmentalGoals,环境目标,环境目标(如扬尘控制、噪声限值)或业主具体要求(如绿色施工认证)。,环境目标;扬尘控制;噪声限值;绿色施工指标;文明施工目标,
-2,overview,工程概况,6,RiskLevel,风险辨识与分级,危害隐患性词汇类、法规名称类、标准编号类。风险等级相关专业性词汇、属于、标准编号或其它编号、部门名称类、数值类、量化单位类。名称类、数值类。,1,DangerSource,危险源,"第一优先级(引用识别): 若文本中出现如“见表XX”、“见附件XX”、“相关表格放置于第十章(或某章)”等明确指向外部表格或附件的表述,直接视为满足当前审查要求。
-第二优先级(要素审查): 若文本中没有指向外部的引用,请审查正文是否同时包含了以下核心要素:列出具体的危险源。",危险源;风险源;危害因素;安全隐患;事故隐患;危险因素;风险点,
-2,overview,工程概况,6,RiskLevel,风险辨识与分级,危害隐患性词汇类、法规名称类、标准编号类。风险等级相关专业性词汇、属于、标准编号或其它编号、部门名称类、数值类、量化单位类。名称类、数值类。,2,ClassificationAndResponseMeasures,分级与应对措施,"第一优先级(引用识别): 若文本中出现如“见表XX”、“见附件XX”、“相关表格放置于第十章(或某章)”等明确指向外部表格或附件的表述,直接视为满足当前审查要求。
-第二优先级(要素审查): 若文本中没有指向外部的引用,请审查正文是否同时包含了以下核心要素:① 对危险源进行分级;;② 明确对应的应对措施。",风险等级;重大风险;较大风险;一般风险;应对措施;LEC;风险分级;风险评估,
+2,overview,工程概况,6,RiskLevel,风险辨识与分级,危害隐患性词汇类、法规名称类、标准编号类。风险等级相关专业性词汇、属于、标准编号或其它编号、部门名称类、数值类、量化单位类。名称类、数值类。,1,DangerSource,危险源,"第一优先级(引用识别): 若文本中出现如“见表XX”、“见附件XX”、“相关表格放置于第十章(或某章)”等明确指向外部表格或附件的表述,直接视为满足当前审查要求,需同时将其分类为危险源
+第二优先级(要素审查): 若文本中没有指向外部的引用,请审查正文是否同时包含了以下核心要素:列出具体的危险源。",详见;风险辨识与分级;危险源;风险源;危害因素;安全隐患;事故隐患;危险因素;风险点,
+2,overview,工程概况,6,RiskLevel,风险辨识与分级,危害隐患性词汇类、法规名称类、标准编号类。风险等级相关专业性词汇、属于、标准编号或其它编号、部门名称类、数值类、量化单位类。名称类、数值类。,2,ClassificationAndResponseMeasures,分级与应对措施,"第一优先级(引用识别): 若文本中出现如“见表XX”、“见附件XX”、“相关表格放置于第十章(或某章)”等明确指向外部表格或附件的表述,直接视为满足当前审查要求。需同时将其分类为分级与应对措施
+第二优先级(要素审查): 若文本中没有指向外部的引用,请审查正文是否同时包含了以下核心要素:① 对危险源进行分级;;② 明确对应的应对措施。",详见;风险辨识与分级;风险等级;重大风险;较大风险;一般风险;应对措施;LEC;风险分级;风险评估,
 2,overview,工程概况,7,Stakeholders,参建各方责任主体单位,名称类、数值类。,1,UnitType,单位类型,"参建各方责任主体单位主要描述该项目的建设单位、设计单位、监理单位、施
 工单位、监控单位、专业分包单位的名称。",建设单位;设计单位;监理单位;施工单位;参建单位;总承包;社会信用代码,
 3,plan,施工计划,1,Schedule,施工进度计划,关键工程节点安排、施工进度计划横道图、进度控制点、里程碑事件、工序搭接关系、工期延误风险、进度调整机制、施工流水节拍、网络计划技术(如双代号网络图),1,KeyProjectNodeArrangement,关键工程(工序)节点安排,主要工程(工序)节点的起止时间和持续时间、聚焦影响总工期的关键工序(如基础浇筑、主体封顶)、是进度控制的核心;,关键节点;里程碑;关键工序;主要节点;节点工期;关键线路,

+ 1525 - 297
core/construction_review/component/minimal_pipeline/pdf_extractor.py

@@ -2,35 +2,89 @@
 PDF 结构提取器 - 同步并发 OCR 版本
 
 基于 splitter_pdf 逻辑,直接提取章节结构并记录页码。
-支持 OCR 增强:检测表格区域并使用 ThreadPoolExecutor 5并发 OCR,其他文本保持 PyMuPDF 提取。
+支持 OCR 增强:表格检测和识别委托给 OcrProcessor,其他文本保持 PyMuPDF 提取。
 输出格式兼容后续分类与组装流程。
 """
 
 import re
-from typing import Dict, Any, List, Optional, Tuple
+from dataclasses import dataclass
+from typing import Dict, Any, List, Optional, Tuple, Set
 
 import fitz
 
 from foundation.observability.logger.loggering import review_logger as logger
+from .ocr_processor import OcrProcessor
 
-from .ocr_processor import OcrProcessor, TableRegion, OcrResult
 
-# 尝试导入 RapidLayout
-try:
-    from rapid_layout import RapidLayout
-    RAPID_LAYOUT_AVAILABLE = True
-except ImportError:
-    RAPID_LAYOUT_AVAILABLE = False
-    RapidLayout = None
+@dataclass
+class TableRegion:
+    """表格区域信息"""
+    page_num: int
+    page: fitz.Page
+    bbox: Tuple[float, float, float, float]
+    score: float
+
+
+@dataclass
+class OcrResult:
+    """OCR 结果"""
+    page_num: int
+    bbox: Tuple[float, float, float, float]
+    score: float
+    text: str
+    success: bool
 
 
 class PdfStructureExtractor:
     """PDF 章节结构提取器(支持 OCR 异步并发)"""
 
-    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
-    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
+    RULE_LIB = {
+        "Rule_1_纯数字派": {
+            "l1": re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_2_混合章派": {
+            "l1": re.compile(r"^第\s*(\d+)\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_3_中英混血派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_4_传统公文派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[、\s]+([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_5_单边括号派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[)\)\]][\s]*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_6_小节派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^第\s*([一二三四五六七八九十百零两]+)\s*节[\s、]*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_7_粗体括号派": {
+            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
+            "l2": re.compile(r"^[【\[]\s*(\d+)\s*[\]】][\s]*([\u4e00-\u9fa5A-Za-z].*)"),
+        },
+        "Rule_8_cn_list_l1_numeric_l2": {
+            "l1": re.compile(
+                r"^(?:[一二三四五六七八九十百零两]+)[、\)\]\uFF09]\s*[\u4e00-\u9fa5A-Za-z].*"
+            ),
+            "l2": re.compile(
+                r"^\d{1,2}(?:[、\.\uFF0E\u3002\)\]\uFF09])\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*"
+            ),
+        },
+    }
     TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
 
+    # OCR 配置
+    MAX_SHORT_EDGE = 1024
+    JPEG_QUALITY = 90
+    OCR_DPI = 200
+    OCR_CONFIDENCE_THRESHOLD = 0.5
+    OCR_CONCURRENT_WORKERS = 5
+
     def __init__(
         self,
         clip_top: float = 60,
@@ -44,22 +98,37 @@ class PdfStructureExtractor:
     ):
         self.clip_top = clip_top
         self.clip_bottom = clip_bottom
-        self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
 
-        # 初始化 OCR 处理器
-        self._ocr_processor = OcrProcessor(
-            ocr_api_url=ocr_api_url,
-            ocr_timeout=ocr_timeout,
-            ocr_api_key=ocr_api_key,
-        ) if self.use_ocr else None
+        # OCR 配置
+        self.ocr_api_url = ocr_api_url
+        self.ocr_timeout = ocr_timeout
+        self.ocr_api_key = ocr_api_key
+        self.ocr_processor: Optional[OcrProcessor] = None
+        self.use_ocr = False
+        if use_ocr:
+            self.ocr_processor = OcrProcessor(
+                ocr_api_url=ocr_api_url,
+                ocr_timeout=ocr_timeout,
+                ocr_api_key=ocr_api_key,
+                max_short_edge=self.MAX_SHORT_EDGE,
+                jpeg_quality=self.JPEG_QUALITY,
+                ocr_dpi=self.OCR_DPI,
+                confidence_threshold=self.OCR_CONFIDENCE_THRESHOLD,
+                concurrent_workers=self.OCR_CONCURRENT_WORKERS,
+            )
+            self.use_ocr = self.ocr_processor.is_available()
+        self._layout_engine: Optional[Any] = None
 
         # 目录检测配置
         self.detect_toc = detect_toc
         self.toc_model_path = toc_model_path
         self._toc_extractor = None
 
-        if use_ocr and not RAPID_LAYOUT_AVAILABLE:
-            logger.warning("RapidLayout 未安装,OCR 功能不可用")
+    def _get_layout_engine(self) -> Optional[Any]:
+        """兼容旧调用,实际由 OcrProcessor 管理版面引擎。"""
+        if self.ocr_processor is None:
+            return None
+        return self.ocr_processor._get_layout_engine()
 
     def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
         """
@@ -91,6 +160,7 @@ class PdfStructureExtractor:
             try:
                 catalog = self._extract_catalog(file_content, progress_callback)
                 if catalog:
+                    catalog = self._normalize_catalog(catalog)
                     result["catalog"] = catalog
                     logger.info(f"[PDF提取] 目录提取完成: {catalog.get('total_chapters', 0)} 章")
             except Exception as e:
@@ -100,6 +170,29 @@ class PdfStructureExtractor:
         doc = fitz.open(stream=file_content)
         try:
             structure = self._extract_from_doc(doc, progress_callback)
+            if result.get("catalog"):
+                # 正文抽取和目录检测是两条独立链路:
+                # 1. 正文抽取更容易拿到连续 content
+                # 2. 目录检测更容易保留顺序和层级
+                # 这里先用目录骨架对齐正文,再按标题边界重建内容,尽量减少漏标题造成的结构缺失。
+                structure["chapters"] = self._reconcile_structure_with_catalog(
+                    structure.get("chapters", {}),
+                    result["catalog"],
+                )
+                rebuilt_chapters = self._rebuild_section_contents_from_catalog(
+                    structure.get("chapters", {}),
+                    result["catalog"],
+                    structure.get("_body_lines", []),
+                )
+                if rebuilt_chapters:
+                    structure["chapters"] = rebuilt_chapters
+                enriched_catalog = self._enrich_catalog_with_structure(
+                    result["catalog"],
+                    structure.get("chapters", {}),
+                )
+                if enriched_catalog:
+                    result["catalog"] = enriched_catalog
+            structure.pop("_body_lines", None)
             result["chapters"] = structure.get("chapters", {})
             result["total_pages"] = len(doc)
             return result
@@ -117,31 +210,22 @@ class PdfStructureExtractor:
         from .toc_detector import TOCCatalogExtractor
 
         if self._toc_extractor is None:
-            # 使用 OCR 处理器的配置(如果已初始化)
-            ocr_config = {}
-            if self._ocr_processor:
-                ocr_config = {
-                    "ocr_api_url": self._ocr_processor.ocr_api_url,
-                    "ocr_api_key": self._ocr_processor.ocr_api_key,
-                    "ocr_timeout": self._ocr_processor.ocr_timeout,
-                }
             self._toc_extractor = TOCCatalogExtractor(
                 model_path=self.toc_model_path,
-                **ocr_config
+                ocr_api_url=self.ocr_api_url,
+                ocr_api_key=self.ocr_api_key,
+                ocr_timeout=self.ocr_timeout,
             )
 
         return self._toc_extractor.detect_and_extract(file_content, progress_callback)
 
     def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
-        """
-        提取文档结构(支持 OCR 异步并发)- 带坐标的精准回填方案。
-
-        流程:
-        1. 提取带坐标的文本块
-        2. 章节标题匹配 + 块归属划分
-        3. 扫描表格区域并 OCR
-        4. 根据表格坐标,将其作为新的块插入到对应小节
-        5. 将每个小节的块列表按顺序拼接成纯文本输出
+        """Extract the document structure (with async concurrent OCR).
+
+        Three phases overall:
+        1. Scan the pages for table regions that need OCR replacement.
+        2. Run OCR concurrently and backfill the recognized text page by page.
+        3. Re-walk the page text and cut chapter / section structure by heading rules.
+        """
 
         def _emit_progress(stage: str, current: int, message: str):
@@ -152,38 +236,106 @@ class PdfStructureExtractor:
                 except Exception:
                     pass
 
-        total_pages = len(doc)
+        # === Phase 1: collect all table regions that need OCR ===
+        table_regions: List[TableRegion] = []
+
+        if self.use_ocr:
+            logger.info("[OCR预处理] 扫描所有页面的表格区域...")
+            total_pages = len(doc)
+            for page_num in range(total_pages):
+                page = doc.load_page(page_num)
+                rect = page.rect
+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+                regions = self._detect_table_regions(page, page_num + 1, clip_box)
+                for bbox, score in regions:
+                    table_regions.append(TableRegion(
+                        page_num=page_num + 1,
+                        page=page,
+                        bbox=bbox,
+                        score=score
+                    ))
+                # Emit progress every 5 pages and on the last page
+                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
+                    progress = int((page_num + 1) / total_pages * 30)  # OCR pre-scan accounts for 30% of overall progress
+                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
+            logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
+
+        # === Phase 2: run OCR concurrently (5 workers) ===
+        ocr_results: List[OcrResult] = []
 
-        # ==================== 阶段1: 提取带坐标的文本块并归属到章节/小节====================
-        logger.info("[阶段1] 提取带坐标的文本块并归属章节...")
+        if table_regions:
+            logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
+            _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
+            ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
+            success_count = sum(1 for r in ocr_results if r.success)
+            logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
+            _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
 
-        # 数据结构: {(chapter_name, section_name): [blocks_with_position]}
-        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
+        # Group successful OCR results by page number
+        ocr_by_page: Dict[int, List[OcrResult]] = {}
+        for result in ocr_results:
+            if result.success:
+                ocr_by_page.setdefault(result.page_num, []).append(result)
+
+        # === Phase 3: extract page text (apply OCR results) and split into chapters ===
+        structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
+        # body_lines keeps the linear body text after header/footer filtering;
+        # the TOC backfill step later re-cuts it along heading boundaries.
+        body_lines: List[Dict[str, Any]] = []
         current_chapter = "未分类前言"
         current_section = "默认部分"
         in_body = False
+        candidate_rule_names: Optional[List[str]] = None
+        active_rule_name: Optional[str] = None
 
-        for page_num in range(total_pages):
+        logger.info("[文本提取] 提取页面内容并切分章节...")
+
+        for page_num in range(len(doc)):
             page = doc.load_page(page_num)
             rect = page.rect
             clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
 
-            # 获取带坐标的文本块
-            blocks = self._extract_text_blocks_with_position(page, clip_box)
-
-            for block in blocks:
-                line = block["text"]
-
-                # 跳过空行和页眉页脚
-                if not line.strip():
+            # Get the page text (with OCR results applied)
+            if page_num + 1 in ocr_by_page:
+                original_text = page.get_text("text", clip=clip_box)
+                ocr_results_list = [
+                    {
+                        "region_index": i,
+                        "bbox": r.bbox,
+                        "score": r.score,
+                        "ocr_text": r.text,
+                    }
+                    for i, r in enumerate(ocr_by_page[page_num + 1])
+                ]
+                text = self._replace_table_regions(page, original_text, ocr_results_list, clip_box)
+            else:
+                text = page.get_text("text", clip=clip_box)
+
+            lines = self._prepare_page_lines(text)
+            for line in lines:
+                if not line or self._is_header_footer(line):
+                    continue
+                body_lines.append({
+                    "page": page_num + 1,
+                    "text": line,
+                })
+
+            for line in lines:
+                line = line.strip()
+                if not line:
                     continue
                 if self._is_header_footer(line):
                     continue
 
                 # 跳过目录阶段
                 if not in_body:
-                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
+                    # Only treat the document as in the body after the first genuine
+                    # level-1 heading; this keeps TOC pages that happen to match the
+                    # heading rules from being mistaken for body structure.
+                    matched_rules = self._matching_rule_names(line, "l1")
+                    if matched_rules and not self.TOC_PATTERN.search(line):
                         in_body = True
+                        candidate_rule_names = matched_rules
                     else:
                         continue
 
@@ -191,291 +343,1367 @@ class PdfStructureExtractor:
                 if self.TOC_PATTERN.search(line):
                     continue
 
+                # candidate_rule_names = heading schemes this document might use;
+                # active_rule_name = the level-2 heading rule confirmed to be in use.
+                # Starting with loose candidates and converging on a single rule reduces
+                # cross-matching in documents that mix numbering schemes.
+                active_scope = [active_rule_name] if active_rule_name else candidate_rule_names
+
                 # 匹配章标题
-                if self.CHAPTER_PATTERN.match(line):
+                matched_chapter_rules = self._matching_rule_names(line, "l1", active_scope)
+                if matched_chapter_rules:
+                    if active_rule_name is None:
+                        candidate_rule_names = matched_chapter_rules
                     current_chapter = self._clean_chapter_title(line)
                     current_section = "章节标题"
-                    key = (current_chapter, current_section)
-                    if key not in chapter_blocks:
-                        chapter_blocks[key] = []
-                    chapter_blocks[key].append(block)
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = {
+                            "lines": [],
+                            "page_start": page_num + 1,
+                            "page_end": page_num + 1,
+                        }
                     continue
 
                 # 匹配节标题
-                if self.SECTION_PATTERN.match(line):
-                    current_section = line
-                    key = (current_chapter, current_section)
-                    if key not in chapter_blocks:
-                        chapter_blocks[key] = []
-                    chapter_blocks[key].append(block)
+                matched_section_rules = self._matching_rule_names(line, "l2", active_scope)
+                if matched_section_rules:
+                    if active_rule_name is None:
+                        if candidate_rule_names:
+                            for rule_name in candidate_rule_names:
+                                if rule_name in matched_section_rules:
+                                    active_rule_name = rule_name
+                                    break
+                        if active_rule_name is None:
+                            active_rule_name = matched_section_rules[0]
+                    current_section = self._clean_section_title(line)
+                    if current_chapter not in structured_data:
+                        structured_data[current_chapter] = {}
+                    if current_section not in structured_data[current_chapter]:
+                        structured_data[current_chapter][current_section] = {
+                            "lines": [],
+                            "page_start": page_num + 1,
+                            "page_end": page_num + 1,
+                        }
                     continue
 
-                # 普通内容块
-                key = (current_chapter, current_section)
-                if key not in chapter_blocks:
-                    chapter_blocks[key] = []
-                chapter_blocks[key].append(block)
+                # Ensure the chapter/section entry exists
+                if current_chapter not in structured_data:
+                    structured_data[current_chapter] = {}
+                if current_section not in structured_data[current_chapter]:
+                    structured_data[current_chapter][current_section] = {
+                        "lines": [],
+                        "page_start": page_num + 1,
+                        "page_end": page_num + 1,
+                    }
 
-        logger.info(f"[阶段1] 章节结构提取完成,共 {len({k[0] for k in chapter_blocks})} 个章节")
+                # Append the content line
+                structured_data[current_chapter][current_section]["lines"].append(line)
+                structured_data[current_chapter][current_section]["page_end"] = page_num + 1
+
+        # Join each section's line list into text
+        result: Dict[str, Any] = {"chapters": {}, "_body_lines": body_lines}
+        for chap, sections in structured_data.items():
+            result["chapters"][chap] = {}
+            for sec, data in sections.items():
+                result["chapters"][chap][sec] = {
+                    "content": "\n".join(data["lines"]),
+                    "page_start": data["page_start"],
+                    "page_end": data["page_end"],
+                }
 
-        # ==================== 阶段2: 收集表格区域并OCR(如果启用OCR)====================
-        table_regions: List[TableRegion] = []
-        ocr_results: List[OcrResult] = []
+        logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
+        return result
 
-        if self.use_ocr and self._ocr_processor:
-            logger.info("[阶段2] 扫描表格区域...")
-            for page_num in range(total_pages):
-                page = doc.load_page(page_num)
-                rect = page.rect
-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-                regions = self._ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
-                for bbox, score in regions:
-                    table_regions.append(TableRegion(
-                        page_num=page_num + 1,
-                        page=page,
-                        bbox=bbox,
-                        score=score
-                    ))
-                # 每5页推送进度
-                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
-                    progress = int((page_num + 1) / total_pages * 30)
-                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
+    def _normalize_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
+        """Unify the catalog sources and merge the better one in.
+
+        The chapters emitted by the TOC detector are more of a skeleton, while
+        raw_ocr_text is closer to the original page text. Parse both, decide
+        which is more trustworthy, then merge them to fill the gaps.
+        """
+        if not catalog:
+            return {}
+
+        normalized = dict(catalog)
+        existing_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
+        raw_text = catalog.get("raw_ocr_text", "")
+        parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
+        selected_chapters = existing_chapters
+
+        if parsed_chapters:
+            if self._should_prefer_parsed_catalog(parsed_chapters, existing_chapters):
+                selected_chapters = parsed_chapters
+            elif existing_chapters:
+                logger.info(
+                    "[PDF提取] raw_ocr_text目录解析结果异常,保留原始目录骨架: "
+                    f"parsed={len(parsed_chapters)}, original={len(existing_chapters)}"
+                )
+            else:
+                selected_chapters = parsed_chapters
 
-            # 执行OCR
-            if table_regions:
-                _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
-                ocr_results = self._ocr_processor.process_ocr_concurrent(
-                    table_regions,
-                    progress_callback=lambda completed, total: _emit_progress(
-                        "版面分析", 35 + int(completed / total * 15), f"OCR识别中 {completed}/{total}"
-                    )
+        if selected_chapters:
+            selected_chapters = self._merge_catalog_chapters(
+                selected_chapters,
+                parsed_chapters,
+            )
+            normalized["chapters"] = selected_chapters
+            normalized["total_chapters"] = len(selected_chapters)
+            normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
+        return normalized
+
+    def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
+        """Parse the raw OCR text of the TOC pages into a chapter tree.
+
+        The document's TOC style is inferred from the first batch of matched
+        level-1 headings; level-2 headings are then converged onto the same rule
+        set so that different numbering schemes do not contaminate each other.
+        """
+        if not text or not text.strip():
+            return []
+
+        chapters: List[Dict[str, Any]] = []
+        current_chapter: Optional[Dict[str, Any]] = None
+        active_l2_rule: Optional[str] = None
+        document_l1_rules: Optional[List[str]] = None
+
+        for raw_line in self._prepare_catalog_raw_lines(text):
+            title_text, page = self._split_catalog_entry(raw_line)
+            if not title_text:
+                continue
+
+            compact = re.sub(r"\s+", "", title_text)
+            if compact in {"目录", "目錄"}:
+                continue
+
+            chapter_matches = self._matching_rule_names(title_text, "l1", document_l1_rules)
+            if chapter_matches:
+                if document_l1_rules is None:
+                    document_l1_rules = chapter_matches
+                current_chapter = {
+                    "index": len(chapters) + 1,
+                    "title": self._clean_chapter_title(title_text),
+                    "page": str(page or 1),
+                    "original": raw_line.strip(),
+                    "subsections": [],
+                }
+                chapters.append(current_chapter)
+                active_l2_rule = None
+                continue
+
+            if current_chapter is None:
+                continue
+
+            section_matches = self._matching_rule_names(title_text, "l2")
+            if not section_matches:
+                numeric_section_title = self._coerce_numeric_catalog_section(
+                    title_text,
+                    document_l1_rules,
+                    active_l2_rule,
                 )
-                success_count = sum(1 for r in ocr_results if r.success)
-                logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
-                _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
-
-        # ==================== 阶段3: 将OCR结果作为新块插入到对应章节====================
-        if ocr_results:
-            logger.info("[阶段3] 将OCR结果回填到对应章节...")
-            self._insert_ocr_blocks_into_chapters(chapter_blocks, ocr_results)
-
-        # ==================== 阶段4: 生成最终输出(块列表转纯文本)====================
-        logger.info("[阶段4] 生成最终文本输出...")
-        result: Dict[str, Any] = {"chapters": {}}
-
-        for (chap_name, sec_name), blocks in chapter_blocks.items():
-            if chap_name not in result["chapters"]:
-                result["chapters"][chap_name] = {}
-
-            # 按页码和Y坐标排序块
-            blocks.sort(key=lambda b: (b["page"], b["bbox"][1]))
-
-            # 拼接文本
-            lines = []
-            page_start = blocks[0]["page"] if blocks else 1
-            page_end = blocks[-1]["page"] if blocks else 1
-
-            for block in blocks:
-                if block.get("type") == "table":
-                    lines.append(f"\n[表格OCR识别结果]:\n{block['text']}\n[/表格]\n")
-                else:
-                    lines.append(block["text"])
+                if numeric_section_title:
+                    section_key = self._normalize_heading_key(numeric_section_title)
+                    existing_keys = {
+                        self._normalize_heading_key(sub.get("title", ""))
+                        for sub in current_chapter.get("subsections", [])
+                    }
+                    if section_key not in existing_keys:
+                        current_chapter["subsections"].append({
+                            "title": numeric_section_title,
+                            "page": str(page or current_chapter.get("page", 1)),
+                            "level": 2,
+                            "original": raw_line.strip(),
+                        })
+                continue
 
-            result["chapters"][chap_name][sec_name] = {
-                "content": "\n".join(lines),
-                "page_start": page_start,
-                "page_end": page_end,
+            if active_l2_rule is None:
+                active_l2_rule = section_matches[0]
+            if active_l2_rule not in section_matches:
+                continue
+
+            section_title = self._clean_section_title(title_text)
+            section_key = self._normalize_heading_key(section_title)
+            existing_keys = {
+                self._normalize_heading_key(sub.get("title", ""))
+                for sub in current_chapter.get("subsections", [])
             }
+            if section_key in existing_keys:
+                continue
 
-        logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
-        return result
+            current_chapter["subsections"].append({
+                "title": section_title,
+                "page": str(page or current_chapter.get("page", 1)),
+                "level": 2,
+                "original": raw_line.strip(),
+            })
 
-    def _extract_text_blocks_with_position(
-        self,
-        page: fitz.Page,
-        clip_box: fitz.Rect
+        return chapters
+
+    @classmethod
+    def _sanitize_catalog_chapters(cls, chapters: Any) -> List[Dict[str, Any]]:
+        if not isinstance(chapters, list):
+            return []
+
+        sanitized: List[Dict[str, Any]] = []
+        seen_chapter_keys: Set[str] = set()
+
+        for idx, chapter in enumerate(chapters, 1):
+            if not isinstance(chapter, dict):
+                continue
+
+            chapter_title = cls._clean_chapter_title(str(chapter.get("title", "") or ""))
+            chapter_key = cls._normalize_heading_key(chapter_title)
+            if not chapter_key or chapter_key in seen_chapter_keys:
+                continue
+
+            seen_chapter_keys.add(chapter_key)
+            chapter_page = str(chapter.get("page") or idx)
+            subsections: List[Dict[str, Any]] = []
+            seen_section_keys: Set[str] = set()
+
+            for subsection in chapter.get("subsections", []) or []:
+                if not isinstance(subsection, dict):
+                    continue
+
+                section_title = cls._clean_section_title(str(subsection.get("title", "") or ""))
+                section_key = cls._normalize_heading_key(section_title)
+                if not section_key or section_key in seen_section_keys:
+                    continue
+
+                seen_section_keys.add(section_key)
+                subsections.append({
+                    "title": section_title,
+                    "page": str(subsection.get("page") or chapter_page),
+                    "level": 2,
+                    "original": subsection.get("original", "") or section_title,
+                })
+
+            sanitized.append({
+                "index": len(sanitized) + 1,
+                "title": chapter_title,
+                "page": chapter_page,
+                "original": chapter.get("original", "") or chapter_title,
+                "subsections": subsections,
+            })
+
+        return sanitized
+
+    @classmethod
+    def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
+        raw_lines = [line.strip() for line in text.splitlines() if line.strip()]
+        prepared: List[str] = []
+        index = 0
+
+        while index < len(raw_lines):
+            current = raw_lines[index].strip()
+            compact_current = re.sub(r"\s+", "", current)
+
+            if compact_current in {"目", "錄", "录"} and index + 1 < len(raw_lines):
+                next_compact = re.sub(r"\s+", "", raw_lines[index + 1].strip())
+                if compact_current + next_compact in {"目录", "目錄"}:
+                    prepared.append(compact_current + next_compact)
+                    index += 2
+                    continue
+
+            if cls._is_incomplete_heading_fragment(current) and index + 1 < len(raw_lines):
+                next_line = raw_lines[index + 1].strip()
+                candidate = f"{current} {next_line}".strip()
+                _, candidate_page = cls._split_catalog_entry(candidate)
+                if (
+                    cls._matching_rule_names(candidate, "l1")
+                    or cls._matching_rule_names(candidate, "l2")
+                    or candidate_page is not None
+                ):
+                    prepared.append(candidate)
+                    index += 2
+                    continue
+
+            prepared.append(current)
+            index += 1
+
+        return prepared
+
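The line re-joining above (a split "目录" header, and heading fragments glued to their continuation) can be illustrated with a minimal sketch. `PAGE_TAIL` and the trailing-page-number test are simplifications standing in for `_is_incomplete_heading_fragment` and `_split_catalog_entry`:

```python
import re
from typing import List

# A trailing page number marks a complete TOC entry (simplified assumption).
PAGE_TAIL = re.compile(r"\d+\s*$")


def join_split_toc_lines(lines: List[str]) -> List[str]:
    out: List[str] = []
    i = 0
    while i < len(lines):
        cur = lines[i].strip()
        nxt = lines[i + 1].strip() if i + 1 < len(lines) else None
        # OCR sometimes splits the "目录" header across two physical lines.
        if (re.sub(r"\s+", "", cur) in {"目", "录"} and nxt
                and re.sub(r"\s+", "", cur + nxt) == "目录"):
            out.append("目录")
            i += 2
            continue
        # A fragment without a page number is glued to the next line when the
        # combination ends in one.
        if nxt and not PAGE_TAIL.search(cur) and PAGE_TAIL.search(nxt):
            out.append(f"{cur} {nxt}")
            i += 2
            continue
        out.append(cur)
        i += 1
    return out
```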
+    @classmethod
+    def _should_prefer_parsed_catalog(
+        cls,
+        parsed_chapters: List[Dict[str, Any]],
+        existing_chapters: List[Dict[str, Any]],
+    ) -> bool:
+        if not parsed_chapters:
+            return False
+
+        parsed_is_suspicious = cls._catalog_has_suspicious_structure(parsed_chapters)
+        existing_is_suspicious = cls._catalog_has_suspicious_structure(existing_chapters)
+
+        if parsed_is_suspicious:
+            if not existing_chapters or not existing_is_suspicious:
+                return False
+
+            parsed_score = cls._catalog_structure_score(parsed_chapters)
+            existing_score = cls._catalog_structure_score(existing_chapters)
+            overlap_ratio = cls._catalog_chapter_overlap_ratio(parsed_chapters, existing_chapters)
+            return overlap_ratio >= 0.6 and parsed_score > existing_score
+
+        if not existing_chapters:
+            return True
+
+        if existing_is_suspicious:
+            return True
+
+        if cls._should_prefer_single_level_parsed_catalog(parsed_chapters, existing_chapters):
+            return True
+
+        parsed_score = cls._catalog_structure_score(parsed_chapters)
+        existing_score = cls._catalog_structure_score(existing_chapters)
+        if parsed_score <= existing_score:
+            return False
+
+        # existing_chapters is already known to be non-suspicious at this point
+        # (handled above), so apply the chapter-count plausibility bounds directly.
+        existing_count = len(existing_chapters)
+        parsed_count = len(parsed_chapters)
+        if parsed_count > max(existing_count * 2, existing_count + 8):
+            return False
+        if existing_count >= 4 and parsed_count < max(2, existing_count // 2):
+            return False
+
+        return True
+
+    @classmethod
+    def _should_prefer_single_level_parsed_catalog(
+        cls,
+        parsed_chapters: List[Dict[str, Any]],
+        existing_chapters: List[Dict[str, Any]],
+    ) -> bool:
+        """Special-case a flat TOC that was misread as one chapter with many sections."""
+        if len(parsed_chapters) < 2 or len(existing_chapters) != 1:
+            return False
+
+        if any(chapter.get("subsections") for chapter in parsed_chapters):
+            return False
+
+        existing_subsections = existing_chapters[0].get("subsections", []) or []
+        if len(existing_subsections) < len(parsed_chapters) - 1:
+            return False
+
+        parsed_pages = [
+            cls._safe_page_number(chapter.get("page"), 1)
+            for chapter in parsed_chapters
+        ]
+        return parsed_pages == sorted(parsed_pages)
+
+    @classmethod
+    def _catalog_has_suspicious_structure(cls, chapters: List[Dict[str, Any]]) -> bool:
+        if not chapters:
+            return False
+
+        titles = [(chapter.get("title", "") or "").strip() for chapter in chapters]
+        chinese_chapter_count = sum(
+            1 for title in titles
+            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", title)
+        )
+        numeric_heading_count = sum(
+            1 for title in titles
+            if re.match(r"^\d{1,2}(?:[\..。、])?\s+\S+", title)
+        )
+        embedded_numeric_body_count = 0
+        repeated_chapter_no_count = 0
+        reversed_chapter_no_count = 0
+        seen_chapter_numbers: Set[str] = set()
+        previous_numeric_chapter_no: Optional[int] = None
+
+        for title in titles:
+            chapter_match = re.match(
+                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
+                title,
+            )
+            if not chapter_match:
+                continue
+
+            chapter_no = re.sub(r"\s+", "", chapter_match.group(1))
+            chapter_body = (chapter_match.group(2) or "").strip()
+            if chapter_no in seen_chapter_numbers:
+                repeated_chapter_no_count += 1
+            seen_chapter_numbers.add(chapter_no)
+
+            if chapter_no.isdigit():
+                current_numeric_no = int(chapter_no)
+                if previous_numeric_chapter_no is not None and current_numeric_no < previous_numeric_chapter_no:
+                    reversed_chapter_no_count += 1
+                previous_numeric_chapter_no = current_numeric_no
+
+            if re.match(r"^\d{1,2}(?:\.\d{1,2})*\.?(?:\s+|$)", chapter_body):
+                embedded_numeric_body_count += 1
+
+        if chinese_chapter_count >= 2 and numeric_heading_count >= max(3, chinese_chapter_count // 2):
+            return True
+
+        if chinese_chapter_count >= max(2, len(titles) // 3) and numeric_heading_count >= max(2, len(titles) // 6):
+            return True
+
+        if embedded_numeric_body_count >= max(2, len(titles) // 5):
+            return True
+
+        if repeated_chapter_no_count > 0 or reversed_chapter_no_count > 0:
+            return True
+
+        return False
+
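A simplified version of the mixed-numbering heuristic above, keeping only the first of the four suspicion signals (Chinese chapter headings mixed with many bare numeric headings); `looks_suspicious` is an illustrative stand-in, not the full `_catalog_has_suspicious_structure`:

```python
import re
from typing import List


def looks_suspicious(titles: List[str]) -> bool:
    """Many "第X章" chapters alongside many bare numeric headings at the same
    level suggests the TOC parser flattened two heading levels into one."""
    chinese = sum(
        bool(re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", t))
        for t in titles
    )
    numeric = sum(bool(re.match(r"^\d{1,2}(?:[\..。、])?\s+\S+", t)) for t in titles)
    return chinese >= 2 and numeric >= max(3, chinese // 2)
```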
+    @staticmethod
+    def _catalog_structure_score(chapters: List[Dict[str, Any]]) -> int:
+        score = 0
+        for chapter in chapters:
+            score += 1
+            score += len(chapter.get("subsections", []) or [])
+        return score
+
+    @classmethod
+    def _catalog_chapter_overlap_ratio(
+        cls,
+        chapters_a: List[Dict[str, Any]],
+        chapters_b: List[Dict[str, Any]],
+    ) -> float:
+        if not chapters_a or not chapters_b:
+            return 0.0
+
+        keys_a = {
+            cls._catalog_chapter_identity_key(chapter.get("title", ""))
+            for chapter in chapters_a
+            if chapter.get("title")
+        }
+        keys_b = {
+            cls._catalog_chapter_identity_key(chapter.get("title", ""))
+            for chapter in chapters_b
+            if chapter.get("title")
+        }
+        if not keys_a or not keys_b:
+            return 0.0
+
+        return len(keys_a & keys_b) / max(1, min(len(keys_a), len(keys_b)))
+
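The overlap measure above is essentially intersection size over the smaller key set; a minimal sketch:

```python
from typing import Iterable


def overlap_ratio(keys_a: Iterable[str], keys_b: Iterable[str]) -> float:
    """Share of chapter identity keys common to both catalogs, measured against
    the smaller set (mirrors _catalog_chapter_overlap_ratio)."""
    a, b = set(keys_a), set(keys_b)
    if not a or not b:
        return 0.0
    return len(a & b) / max(1, min(len(a), len(b)))
```

Dividing by the smaller set means a parsed catalog that is a strict subset of the existing one still scores 1.0, which is what the `overlap_ratio >= 0.6` gate above relies on.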
+    @classmethod
+    def _catalog_chapter_identity_key(cls, title: str) -> str:
+        cleaned = cls._clean_chapter_title(title)
+        if not cleaned:
+            return ""
+
+        chapter_match = re.match(
+            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
+            cleaned,
+        )
+        if chapter_match:
+            chapter_body = cls._normalize_heading_key(chapter_match.group(1))
+            if chapter_body:
+                return chapter_body
+
+        numeric_match = re.match(r"^\d{1,2}(?:[\..。、])?\s*(.*)$", cleaned)
+        if numeric_match:
+            numeric_body = cls._normalize_heading_key(numeric_match.group(1))
+            if numeric_body:
+                return numeric_body
+
+        return cls._normalize_heading_key(cleaned)
+
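The identity key strips the numbering prefix so the same chapter expressed in different numbering styles compares equal. A simplified stand-in (`chapter_identity_key` is hypothetical and folds `_normalize_heading_key` down to whitespace removal):

```python
import re


def chapter_identity_key(title: str) -> str:
    """Reduce a chapter title to its body text, ignoring the numbering style."""
    t = re.sub(r"\s+", "", title)
    # "第X章/节/部分/篇" prefix.
    m = re.match(r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章节部分篇](.*)$", t)
    if m and m.group(1):
        return m.group(1)
    # Bare numeric prefix such as "1." or "2、".
    m = re.match(r"^\d{1,2}[\..。、]?(.*)$", t)
    if m and m.group(1):
        return m.group(1)
    return t
```

With this key, "第一章 总则" from the detector skeleton and "1. 总则" from the raw-text parse land on the same merge bucket.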
+    @classmethod
+    def _merge_catalog_chapters(
+        cls,
+        base_chapters: List[Dict[str, Any]],
+        supplemental_chapters: List[Dict[str, Any]],
     ) -> List[Dict[str, Any]]:
-        """
-        提取带坐标的文本块列表。
+        if not base_chapters:
+            return supplemental_chapters or []
+        if not supplemental_chapters:
+            return base_chapters
+
+        merged: List[Dict[str, Any]] = []
+        supplemental_by_key = {
+            cls._catalog_chapter_identity_key(chapter.get("title", "")): chapter
+            for chapter in supplemental_chapters
+            if chapter.get("title")
+        }
+
+        for index, chapter in enumerate(base_chapters, 1):
+            chapter_copy = {
+                **chapter,
+                "subsections": [dict(sub) for sub in chapter.get("subsections", []) or []],
+            }
+            chapter_key = cls._catalog_chapter_identity_key(chapter_copy.get("title", ""))
+            supplemental = supplemental_by_key.get(chapter_key)
+            if supplemental:
+                merged_subsections = cls._merge_catalog_subsections(
+                    chapter_copy.get("subsections", []),
+                    supplemental.get("subsections", []) or [],
+                )
+                chapter_copy["subsections"] = merged_subsections
+            chapter_copy["index"] = index
+            merged.append(chapter_copy)
 
-        使用 page.get_text("dict") 获取每个文本块的精确边界框和文本内容。
-        """
-        blocks = []
-        page_dict = page.get_text("dict", clip=clip_box)
-
-        for block in page_dict.get("blocks", []):
-            if block.get("type") == 0:  # 文本块
-                bbox = block["bbox"]
-                y_center = (bbox[1] + bbox[3]) / 2
-
-                # 拼接块内所有文本
-                text_lines = []
-                for line in block.get("lines", []):
-                    line_text = ""
-                    for span in line.get("spans", []):
-                        line_text += span.get("text", "")
-                    if line_text.strip():
-                        text_lines.append(line_text)
-
-                if text_lines:
-                    blocks.append({
-                        "text": "\n".join(text_lines),
-                        "page": page.number + 1,
-                        "bbox": bbox,
-                        "y_center": y_center,
-                        "type": "text"
-                    })
-
-        # Sort in reading order (primary: Y coordinate, secondary: X coordinate)
-        blocks.sort(key=lambda b: (b["page"], b["bbox"][1], b["bbox"][0]))
-        return blocks
-
-    def _insert_ocr_blocks_into_chapters(
+        return merged
+
+    @classmethod
+    def _merge_catalog_subsections(
+        cls,
+        base_subsections: List[Dict[str, Any]],
+        supplemental_subsections: List[Dict[str, Any]],
+    ) -> List[Dict[str, Any]]:
+        if not base_subsections:
+            return [dict(sub) for sub in supplemental_subsections]
+        if not supplemental_subsections:
+            return [dict(sub) for sub in base_subsections]
+
+        def _subsection_score(items: List[Dict[str, Any]]) -> int:
+            score = 0
+            for item in items:
+                title = (item.get("title", "") or "").strip()
+                if not title:
+                    continue
+                score += 1
+                if re.match(r"^\d+\.\d+(?!\.\d)\.?\s*", title):
+                    score += 3
+                elif re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)", title):
+                    score += 3
+                elif re.match(r"^([一二三四五六七八九十百零两]+[、)\)\]])", title):
+                    score += 3
+                elif re.match(r"^[【\[]\s*\d+\s*[\]】]", title):
+                    score += 3
+                elif re.match(r"^\d{1,2}[\..。、]\s*", title):
+                    score += 1
+            return score
+
+        base_score = _subsection_score(base_subsections)
+        supplemental_score = _subsection_score(supplemental_subsections)
+        if supplemental_score > base_score:
+            return [dict(sub) for sub in supplemental_subsections]
+
+        merged = [dict(sub) for sub in base_subsections]
+        seen_keys = {
+            cls._normalize_heading_key(sub.get("title", ""))
+            for sub in merged
+            if sub.get("title")
+        }
+        for subsection in supplemental_subsections:
+            subsection_key = cls._normalize_heading_key(subsection.get("title", ""))
+            if not subsection_key or subsection_key in seen_keys:
+                continue
+            merged.append(dict(subsection))
+            seen_keys.add(subsection_key)
+        return merged
+
+    @classmethod
+    def _coerce_numeric_catalog_section(
+        cls,
+        title_text: str,
+        document_l1_rules: Optional[List[str]],
+        active_l2_rule: Optional[str],
+    ) -> Optional[str]:
+        if active_l2_rule is not None:
+            return None
+
+        if not document_l1_rules:
+            return None
+
+        if "Rule_1_纯数字派" in document_l1_rules:
+            return None
+
+        if re.match(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*", title_text.strip()):
+            return cls._clean_section_title(title_text)
+
+        return None
+
+    @staticmethod
+    def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
+        cleaned = line.strip()
+        if not cleaned:
+            return "", None
+
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+        page_match = re.search(
+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
+            cleaned,
+        )
+        if page_match:
+            title_text = cleaned[:page_match.start()].strip()
+            title_text = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", title_text).strip()
+            return title_text, int(page_match.group(1))
+
+        return cleaned, None
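The dot-leader split above can be exercised in isolation. A minimal sketch, where `split_catalog_entry` is a hypothetical standalone mirror of `_split_catalog_entry`'s regex rather than the class method itself:

```python
import re

def split_catalog_entry(line: str):
    """Split a TOC line such as '3.2 施工准备 ..... 12' into (title, page).

    Hypothetical mirror of _split_catalog_entry: a run of two or more leader
    characters (dots, middle dots, bullets, spaces) followed by a trailing
    number is treated as the page; otherwise the page is None.
    """
    cleaned = re.sub(r"\s+", " ", line.strip())
    page_match = re.search(
        r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
        cleaned,
    )
    if page_match:
        title = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", cleaned[:page_match.start()]).strip()
        return title, int(page_match.group(1))
    return cleaned, None
```

Entries without a recognizable leader keep their full text and report no page, which downstream code treats as "page unknown".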
+
+    @staticmethod
+    def _format_catalog_chapters(chapters: List[Dict[str, Any]]) -> str:
+        lines: List[str] = []
+        for chapter in chapters:
+            title = chapter.get("title", "").strip()
+            if not title:
+                continue
+            lines.append(title)
+            for sub in chapter.get("subsections", []):
+                sub_title = sub.get("title", "").strip()
+                if sub_title:
+                    lines.append(f"  {sub_title}")
+        return "\n".join(lines)
+
+    def _enrich_catalog_with_structure(
         self,
-        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]],
-        ocr_results: List[OcrResult]
-    ) -> None:
+        catalog: Dict[str, Any],
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+    ) -> Dict[str, Any]:
+        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
+        if not catalog_chapters or not chapters:
+            return catalog
+
+        enriched = dict(catalog)
+        structure_items = list(chapters.items())
+        structure_by_key = {
+            self._catalog_chapter_identity_key(chapter_title): (chapter_title, sections)
+            for chapter_title, sections in structure_items
+        }
+        used_structure_keys: Set[str] = set()
+
+        enriched_chapters: List[Dict[str, Any]] = []
+        for catalog_chapter in catalog_chapters:
+            chapter_copy = dict(catalog_chapter)
+            chapter_key = self._catalog_chapter_identity_key(chapter_copy.get("title", ""))
+            structure_match = structure_by_key.get(chapter_key)
+            if structure_match is None:
+                enriched_chapters.append(chapter_copy)
+                continue
+
+            structure_title, structure_sections = structure_match
+            used_structure_keys.add(chapter_key)
+            title_payload = structure_sections.get("章节标题", {})
+            chapter_copy["title"] = structure_title
+            chapter_copy["content"] = title_payload.get("content", "")
+            chapter_copy["page_start"] = title_payload.get("page_start", self._safe_page_number(chapter_copy.get("page")))
+            chapter_copy["page_end"] = title_payload.get("page_end", chapter_copy["page_start"])
+
+            structure_subsections = [
+                (section_title, payload)
+                for section_title, payload in structure_sections.items()
+                if section_title != "章节标题"
+            ]
+            catalog_subsections = chapter_copy.get("subsections", []) or []
+            subsection_by_key = {
+                self._normalize_heading_key(subsection.get("title", "")): subsection
+                for subsection in catalog_subsections
+                if subsection.get("title")
+            }
+
+            enriched_subsections: List[Dict[str, Any]] = []
+            for section_title, payload in structure_subsections:
+                section_key = self._normalize_heading_key(section_title)
+                subsection = dict(subsection_by_key.get(section_key, {}))
+                subsection.setdefault("title", section_title)
+                subsection.setdefault("page", str(payload.get("page_start", chapter_copy["page_start"])))
+                subsection.setdefault("level", 2)
+                subsection.setdefault("original", section_title)
+                subsection["content"] = payload.get("content", "")
+                subsection["page_start"] = payload.get("page_start", chapter_copy["page_start"])
+                subsection["page_end"] = payload.get("page_end", subsection["page_start"])
+                enriched_subsections.append(subsection)
+
+            chapter_copy["subsections"] = enriched_subsections
+            enriched_chapters.append(chapter_copy)
+
+        existing_catalog_keys = {
+            self._catalog_chapter_identity_key(chapter.get("title", ""))
+            for chapter in enriched_chapters
+            if chapter.get("title")
+        }
+        for chapter_title, structure_sections in structure_items:
+            chapter_key = self._catalog_chapter_identity_key(chapter_title)
+            if chapter_key in existing_catalog_keys or chapter_key in used_structure_keys:
+                continue
+
+            title_payload = structure_sections.get("章节标题", {})
+            new_chapter = {
+                "index": len(enriched_chapters) + 1,
+                "title": chapter_title,
+                "page": str(title_payload.get("page_start", 1)),
+                "original": chapter_title,
+                "content": title_payload.get("content", ""),
+                "page_start": title_payload.get("page_start", 1),
+                "page_end": title_payload.get("page_end", title_payload.get("page_start", 1)),
+                "subsections": [],
+            }
+            for section_title, payload in structure_sections.items():
+                if section_title == "章节标题":
+                    continue
+                new_chapter["subsections"].append({
+                    "title": section_title,
+                    "page": str(payload.get("page_start", new_chapter["page_start"])),
+                    "level": 2,
+                    "original": section_title,
+                    "content": payload.get("content", ""),
+                    "page_start": payload.get("page_start", new_chapter["page_start"]),
+                    "page_end": payload.get("page_end", payload.get("page_start", new_chapter["page_start"])),
+                })
+            enriched_chapters.append(new_chapter)
+
+        for index, chapter in enumerate(enriched_chapters, 1):
+            chapter["index"] = index
+
+        enriched["chapters"] = enriched_chapters
+        enriched["total_chapters"] = len(enriched_chapters)
+        enriched["formatted_text"] = self._format_catalog_chapters(enriched_chapters)
+        return enriched
+
+    def _reconcile_structure_with_catalog(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        catalog: Dict[str, Any],
+    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
+        """Re-attach body-extraction results to the catalog skeleton.
+
+        Body extraction usually yields fuller section content but can miss
+        hierarchy levels; the catalog result has a stabler hierarchy but its
+        content is empty or incomplete. Titles are matched in order after
+        normalization, mapping body content back onto the catalog structure.
         """
-        Insert OCR results into the matching chapters as new blocks.
+        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
+        if not chapters or not catalog_chapters:
+            return chapters
+
+        section_title_key = "章节标题"
+        # Split the body structure into two indexes (chapter-title payloads and a
+        # flat list of section payloads) so entries can later be matched one by
+        # one in catalog order.
+        chapter_title_payloads: Dict[str, List[Dict[str, Any]]] = {}
+        flat_sections: List[Tuple[str, Dict[str, Any]]] = []
+        matched_chapter_count = 0
+        matched_section_count = 0
+        total_catalog_sections = 0
+
+        for chapter_title, sections in chapters.items():
+            title_key = self._normalize_heading_key(chapter_title)
+            title_payload = sections.get(section_title_key)
+            if title_payload is not None:
+                chapter_title_payloads.setdefault(title_key, []).append({
+                    "content": title_payload.get("content", ""),
+                    "page_start": title_payload.get("page_start", 1),
+                    "page_end": title_payload.get("page_end", title_payload.get("page_start", 1)),
+                })
+
+            for section_title, payload in sections.items():
+                if section_title == section_title_key:
+                    continue
+                flat_sections.append((
+                    self._normalize_heading_key(section_title),
+                    {
+                        "content": payload.get("content", ""),
+                        "page_start": payload.get("page_start", 1),
+                        "page_end": payload.get("page_end", payload.get("page_start", 1)),
+                    },
+                ))
+
+        rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
+        # Prefer matching forward in order; fall back to one global scan when that
+        # fails, balancing accuracy against fault tolerance.
+        search_start = 0
+        used_indices = set()
+
+        for chapter in catalog_chapters:
+            chapter_title = (chapter.get("title", "") or "").strip()
+            if not chapter_title:
+                continue
+
+            chapter_page = self._safe_page_number(chapter.get("page"))
+            chapter_key = self._normalize_heading_key(chapter_title)
+            title_candidates = chapter_title_payloads.get(chapter_key, [])
+            has_title_match = bool(title_candidates)
+            title_payload = title_candidates.pop(0) if title_candidates else self._empty_section_payload(chapter_page)
+            if has_title_match:
+                matched_chapter_count += 1
+
+            rebuilt[chapter_title] = {
+                section_title_key: title_payload,
+            }
+
+            for subsection in chapter.get("subsections", []):
+                section_title = (subsection.get("title", "") or "").strip()
+                if not section_title:
+                    continue
+                total_catalog_sections += 1
+
+                target_key = self._normalize_heading_key(section_title)
+                match_index = None
+                for idx in range(search_start, len(flat_sections)):
+                    if idx in used_indices:
+                        continue
+                    if flat_sections[idx][0] == target_key:
+                        match_index = idx
+                        break
+                if match_index is None:
+                    for idx, (section_key, _) in enumerate(flat_sections):
+                        if idx in used_indices:
+                            continue
+                        if section_key == target_key:
+                            match_index = idx
+                            break
+
+                if match_index is not None:
+                    used_indices.add(match_index)
+                    search_start = max(search_start, match_index + 1)
+                    rebuilt[chapter_title][section_title] = flat_sections[match_index][1]
+                    matched_section_count += 1
+                else:
+                    rebuilt[chapter_title][section_title] = self._empty_section_payload(
+                        self._safe_page_number(subsection.get("page"), chapter_page)
+                    )
+
+        if total_catalog_sections > 0 and matched_section_count == 0:
+            return chapters
 
-        Strategy:
-        1. Find the page that contains the table's Y coordinate.
-        2. Among that page's subsections, find which two text blocks the table's Y coordinate falls between.
-        3. Insert the OCR block at the correct position.
+        if matched_chapter_count == 0 and matched_section_count == 0:
+            return chapters
+
+        return rebuilt or chapters
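The subsection matching above searches forward from the last hit first and only then falls back to a global scan. The strategy can be sketched on plain keys (`match_in_order` is a hypothetical helper, assuming keys are already normalized):

```python
def match_in_order(targets, candidates):
    """Return, per target, the index of the matched candidate (or None).

    Forward-first matching: each target is searched from just past the last
    match, preserving document order; a single global fallback scan over
    unused candidates handles out-of-order headings.
    """
    used, start, result = set(), 0, []
    for target in targets:
        match = next(
            (i for i in range(start, len(candidates))
             if i not in used and candidates[i] == target),
            None,
        )
        if match is None:  # fallback: scan all unused candidates from the start
            match = next(
                (i for i, c in enumerate(candidates)
                 if i not in used and c == target),
                None,
            )
        if match is not None:
            used.add(match)
            start = max(start, match + 1)
        result.append(match)
    return result
```

Keeping `start` monotonic means in-order documents are matched in a single linear pass, while the fallback still recovers headings the extractor reported out of order.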
+
+    @staticmethod
+    def _normalize_heading_key(text: str) -> str:
+        normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
+        normalized = normalized.replace("【", "[").replace("】", "]")
+        normalized = normalized.replace("(", "(").replace(")", ")")
+        normalized = normalized.replace(".", ".").replace("。", ".")
+        normalized = re.sub(r"\s+", "", normalized)
+        return normalized
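Heading keys are compared only after folding full-width punctuation and removing whitespace. A simplified standalone sketch (the `_strip_catalog_page_suffix` step of the real method is omitted here):

```python
import re

def normalize_heading_key(text: str) -> str:
    """Fold full-width brackets and periods to half-width and drop all
    whitespace, so spacing and punctuation-width variants of the same
    heading compare equal."""
    normalized = (text or "").strip()
    normalized = normalized.replace("【", "[").replace("】", "]")
    normalized = normalized.replace("(", "(").replace(")", ")")
    normalized = normalized.replace(".", ".").replace("。", ".")
    return re.sub(r"\s+", "", normalized)
```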
+
+    @staticmethod
+    def _safe_page_number(value: Any, default: int = 1) -> int:
+        try:
+            return max(1, int(str(value).strip()))
+        except Exception:
+            return default
+
+    @staticmethod
+    def _empty_section_payload(page_num: int) -> Dict[str, Any]:
+        return {
+            "content": "",
+            "page_start": page_num,
+            "page_end": page_num,
+        }
+
+    @classmethod
+    def _prepare_page_lines(cls, text: str) -> List[str]:
+        """Clean page text lines and try to re-merge headings that were split across line breaks."""
+        raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
+        prepared_lines: List[str] = []
+        index = 0
+
+        while index < len(raw_lines):
+            merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
+            if merged_line:
+                prepared_lines.append(merged_line)
+                index += consumed
+                continue
+
+            prepared_lines.append(raw_lines[index])
+            index += 1
+
+        return prepared_lines
+
+    @classmethod
+    def _merge_heading_fragment(
+        cls,
+        lines: List[str],
+        start_index: int,
+    ) -> Tuple[Optional[str], int]:
+        """Try to join the 2-3 lines starting at the current position into one complete heading."""
+        first_line = lines[start_index].strip()
+        if not first_line:
+            return None, 1
+
+        first_is_heading = bool(cls._matching_rule_names(first_line, "l1") or cls._matching_rule_names(first_line, "l2"))
+        first_is_incomplete = cls._is_incomplete_heading_fragment(first_line)
+        max_span = min(3, len(lines) - start_index)
+
+        for span in range(2, max_span + 1):
+            candidate_lines = [lines[start_index + offset].strip() for offset in range(span)]
+            candidate_text = " ".join(candidate_lines).strip()
+            if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
+                continue
+            if not (cls._matching_rule_names(candidate_text, "l1") or cls._matching_rule_names(candidate_text, "l2")):
+                continue
+            # Only absorb the following lines when the first line itself looks like
+            # a truncated heading, or the merged text is clearly more heading-like,
+            # to avoid swallowing body text by mistake.
+            if first_is_incomplete or not first_is_heading:
+                return candidate_text, span
+
+        return None, 1
+
+    @staticmethod
+    def _is_incomplete_heading_fragment(line: str) -> bool:
+        clean_line = re.sub(r"\s+", "", line.strip())
+        if not clean_line:
+            return False
+
+        fragment_patterns = (
+            r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",
+            r"^\d{1,2}(?:[\..。、])$",
+            r"^[【\[]\d+[\]】]$",
+            r"^[一二三四五六七八九十百零两]+[、)\)\]]$",
+            r"^第[一二三四五六七八九十百零两]+节$",
+            r"^\d+\.\d+(?!\.\d)\.?$",
+        )
+        return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
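A line counts as a "half heading" when it is nothing but a numbering token with no title text, which is what triggers the merge above. A trimmed standalone sketch using three of the patterns (a hypothetical subset, not the full tuple):

```python
import re

# Hypothetical subset of the patterns in _is_incomplete_heading_fragment:
# bare chapter markers, bare 1-2 digit ordinals with a trailing separator,
# and bracketed indexes.
FRAGMENT_PATTERNS = (
    r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",
    r"^\d{1,2}(?:[\..。、])$",
    r"^[【\[]\d+[\]】]$",
)

def is_incomplete_heading_fragment(line: str) -> bool:
    # Whitespace is stripped first so "第 三 章" is treated like "第三章".
    clean = re.sub(r"\s+", "", line.strip())
    return bool(clean) and any(re.match(p, clean) for p in FRAGMENT_PATTERNS)
```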
+
+    def _rebuild_section_contents_from_catalog(
+        self,
+        chapters: Dict[str, Dict[str, Dict[str, Any]]],
+        catalog: Dict[str, Any],
+        body_lines: List[Dict[str, Any]],
+    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
+        """Re-slice section content based on catalog order and body line numbers.
+
+        When body-structure extraction misses some headings, using the structured
+        result directly tends to leave gaps in section content. Here the catalog is
+        flattened into a single timeline of headings, those headings are located in
+        the linear body text, and the text between the current heading and the next
+        becomes the current section's content.
         """
-        # Group OCR results by page number
-        ocr_by_page: Dict[int, List[OcrResult]] = {}
-        for result in ocr_results:
-            if result.success:
-                if result.page_num not in ocr_by_page:
-                    ocr_by_page[result.page_num] = []
-                ocr_by_page[result.page_num].append(result)
+        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
+        if not catalog_chapters or not body_lines:
+            return chapters
+
+        # Flatten the catalog into an ordered list first, so each heading's
+        # starting position in the body can be located uniformly.
+        expected_items: List[Dict[str, Any]] = []
+        total_sections = 0
+        for chapter in catalog_chapters:
+            chapter_title = (chapter.get("title", "") or "").strip()
+            if not chapter_title:
+                continue
+            chapter_page = self._safe_page_number(chapter.get("page"))
+            expected_items.append({
+                "kind": "chapter",
+                "title": chapter_title,
+                "chapter_title": chapter_title,
+                "section_title": "章节标题",
+                "page_hint": chapter_page,
+                "line_index": None,
+                "page": chapter_page,
+            })
+
+            for subsection in chapter.get("subsections", []):
+                section_title = (subsection.get("title", "") or "").strip()
+                if not section_title:
+                    continue
+                total_sections += 1
+                expected_items.append({
+                    "kind": "section",
+                    "title": section_title,
+                    "chapter_title": chapter_title,
+                    "section_title": section_title,
+                    "page_hint": self._safe_page_number(subsection.get("page"), chapter_page),
+                    "line_index": None,
+                    "page": self._safe_page_number(subsection.get("page"), chapter_page),
+                })
+
+        if not expected_items or total_sections == 0:
+            return chapters
+
+        search_start = 0
+        found_sections = 0
+        for item in expected_items:
+            line_index = self._find_heading_line_index(
+                body_lines,
+                item["title"],
+                item["kind"],
+                search_start,
+            )
+            item["line_index"] = line_index
+            if line_index is not None:
+                item["page"] = body_lines[line_index]["page"]
+                search_start = line_index + 1
+                if item["kind"] == "section":
+                    found_sections += 1
+
+        if found_sections == 0:
+            return chapters
+
+        rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
+        section_title_key = "章节标题"
+
+        for chapter in catalog_chapters:
+            chapter_title = (chapter.get("title", "") or "").strip()
+            if not chapter_title:
+                continue
+
+            chapter_page = self._safe_page_number(chapter.get("page"))
+            existing_sections = chapters.get(chapter_title, {})
+            rebuilt[chapter_title] = {
+                section_title_key: existing_sections.get(section_title_key, self._empty_section_payload(chapter_page))
+            }
+
+            for subsection in chapter.get("subsections", []):
+                section_title = (subsection.get("title", "") or "").strip()
+                if not section_title:
+                    continue
+                rebuilt[chapter_title][section_title] = existing_sections.get(
+                    section_title,
+                    self._empty_section_payload(self._safe_page_number(subsection.get("page"), chapter_page)),
+                )
 
-        # Process each page that contains tables
-        for page_num, ocr_list in ocr_by_page.items():
-            # Find all subsections involved on this page
-            page_sections = []
-            for (chap_name, sec_name), blocks in chapter_blocks.items():
-                # Check whether this subsection has blocks on this page
-                page_blocks = [b for b in blocks if b["page"] == page_num]
-                if page_blocks:
-                    page_sections.append({
-                        "chapter": chap_name,
-                        "section": sec_name,
-                        "blocks": page_blocks,
-                        "all_blocks": blocks,  # reference to the original list, used for insertion
-                    })
-
-            if not page_sections:
-                logger.warning(f"[OCR回填] 第{page_num}页没有匹配到任何小节")
+        for idx, item in enumerate(expected_items):
+            if item["kind"] != "section" or item["line_index"] is None:
                 continue
 
-            # Process each OCR result
-            for ocr_result in sorted(ocr_list, key=lambda r: r.bbox[1]):
-                table_y_top = ocr_result.bbox[1]
-                table_y_bottom = ocr_result.bbox[3]
-                ocr_text = ocr_result.text
-
-                # Build the table block
-                table_block = {
-                    "text": ocr_text,
-                    "page": page_num,
-                    "bbox": ocr_result.bbox,
-                    "y_center": (table_y_top + table_y_bottom) / 2,
-                    "type": "table"
-                }
+            # The next located heading is the current section's right boundary; if
+            # there is none, take everything up to the end of the document.
+            next_heading_index = len(body_lines)
+            for later in expected_items[idx + 1:]:
+                if later["line_index"] is not None:
+                    next_heading_index = later["line_index"]
+                    break
 
-                # Locate the target subsection
-                target_section = None
-                insert_index = -1
+            content_entries = body_lines[item["line_index"] + 1:next_heading_index]
+            content_text = "\n".join(entry["text"] for entry in content_entries).strip()
+            existing_payload = rebuilt[item["chapter_title"]].get(item["section_title"], {})
 
-                for ps in page_sections:
-                    # Get all of this subsection's blocks on the page, sorted by Y coordinate
-                    page_blocks = sorted(ps["blocks"], key=lambda b: b["bbox"][1])
+            if not content_text and (existing_payload.get("content") or "").strip():
+                continue
 
-                    if not page_blocks:
-                        continue
+            if content_entries:
+                page_start = content_entries[0]["page"]
+                page_end = content_entries[-1]["page"]
+            else:
+                page_start = item["page"]
+                page_end = item["page"]
 
-                    # Find where the table should be inserted
-                    # Strategy: which block does the table's top edge come after?
-                    found = False
-                    for i, block in enumerate(page_blocks):
-                        block_y_bottom = block["bbox"][3]
-                        if i < len(page_blocks) - 1:
-                            next_y_top = page_blocks[i + 1]["bbox"][1]
-                        else:
-                            next_y_top = float('inf')
-
-                        # The table sits after the current block and before the next one
-                        if block_y_bottom <= table_y_top < next_y_top:
-                            # Find the position in the original list
-                            try:
-                                insert_index = ps["all_blocks"].index(block) + 1
-                                target_section = ps
-                                found = True
-                                break
-                            except ValueError:
-                                continue
-
-                    # The table precedes all blocks on the page
-                    if not found and table_y_top < page_blocks[0]["bbox"][1]:
-                        try:
-                            insert_index = ps["all_blocks"].index(page_blocks[0])
-                            target_section = ps
-                            found = True
-                        except ValueError:
-                            continue
+            rebuilt[item["chapter_title"]][item["section_title"]] = {
+                "content": content_text,
+                "page_start": page_start,
+                "page_end": page_end,
+            }
 
-                    # The table follows all blocks on the page
-                    if not found and table_y_bottom > page_blocks[-1]["bbox"][3]:
-                        try:
-                            insert_index = ps["all_blocks"].index(page_blocks[-1]) + 1
-                            target_section = ps
-                            found = True
-                        except ValueError:
-                            continue
+        return rebuilt or chapters
 
-                    if found:
-                        break
+    def _find_heading_line_index(
+        self,
+        body_lines: List[Dict[str, Any]],
+        target_title: str,
+        heading_kind: str,
+        start_index: int,
+    ) -> Optional[int]:
+        """Locate the target heading line in the linear body text.
+
+        First try an exact match on normalized keys; if OCR / PDF extraction has
+        prepended noise to the heading, fall back to a looser match where the
+        candidate line's suffix equals the target title.
+        """
+        target_key = self._normalize_heading_key(target_title)
+        if not target_key:
+            return None
 
-                # Perform the insertion
-                if target_section and insert_index >= 0:
-                    target_section["all_blocks"].insert(insert_index, table_block)
-                    logger.debug(
-                        f"[OCR回填] 第{page_num}页表格(Y={table_y_top:.0f}) -> "
-                        f"{target_section['chapter']}/{target_section['section']} 位置{insert_index}"
-                    )
+        for index in range(start_index, len(body_lines)):
+            candidate_text = (body_lines[index].get("text") or "").strip()
+            if not candidate_text or self.TOC_PATTERN.search(candidate_text):
+                continue
+
+            if heading_kind == "chapter":
+                candidate_key = self._normalize_heading_key(self._clean_chapter_title(candidate_text))
+            else:
+                candidate_key = self._normalize_heading_key(self._clean_section_title(candidate_text))
+
+            if candidate_key == target_key:
+                return index
+
+            raw_candidate_key = self._normalize_heading_key(candidate_text)
+            # Some PDFs glue page numbers, ordinals, or leftover characters onto the
+            # front of a heading, so a limited amount of prefix noise is allowed here.
+            if raw_candidate_key.endswith(target_key):
+                prefix = raw_candidate_key[:-len(target_key)]
+                if not prefix or re.fullmatch(
+                    r"[\dA-Za-z\.\-_/|,:;()\[\]\u3001\u3002\uff0c\uff1a\uff1b\uff08\uff09\u3010\u3011]+",
+                    prefix,
+                ):
+                    return index
+
+        return None
+
+    def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
+        """Run OCR concurrently (synchronous entry point); the work is delegated to OcrProcessor."""
+        if self.ocr_processor is None:
+            return []
+
+        if not progress_callback:
+            return self.ocr_processor.process_ocr_concurrent(regions)
+
+        def _progress_adapter(completed: int, total: int):
+            progress = 35 + int(completed / total * 15) if total else 50
+            progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
+
+        return self.ocr_processor.process_ocr_concurrent(
+            regions,
+            progress_callback=_progress_adapter,
+        )
+
+    def _detect_table_regions(
+        self,
+        page: fitz.Page,
+        page_num: int,
+        clip_box: fitz.Rect
+    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
+        """Detect table regions on the page; the implementation is delegated to OcrProcessor."""
+        if self.ocr_processor is None:
+            return []
+        return self.ocr_processor.detect_table_regions(page, page_num, clip_box)
+
+    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
+        """Run OCR on the given region; the implementation is delegated to OcrProcessor."""
+        if self.ocr_processor is None:
+            raise RuntimeError("OCR processor is not initialized")
+        return self.ocr_processor._ocr_table_region(page, bbox, max_retries=max_retries)
+
+    def _replace_table_regions(
+        self,
+        page: fitz.Page,
+        original_text: str,
+        ocr_results: List[Dict],
+        clip_box: fitz.Rect
+    ) -> str:
+        """Replace table regions in the original text with OCR results."""
+        if self.ocr_processor is None:
+            return original_text
+        if not ocr_results:
+            return original_text
+
+        # Compatibility logic kept for the chapter-extraction scenario:
+        # 1. Heading blocks never take part in table replacement, so catalog and
+        #    chapter titles cannot be swallowed by a table frame;
+        # 2. Only body blocks that actually fall inside a table region are replaced,
+        #    keeping ordinary text before and after the table;
+        # 3. When OCR returns nothing, fall back to the original PDF text so an
+        #    entire block is never wiped out.
+        text_blocks = []
+        for block in page.get_text("blocks"):
+            x0, y0, x1, y1, text, _, _ = block
+            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
+                text_blocks.append({
+                    "bbox": (x0, y0, x1, y1),
+                    "text": text.strip(),
+                })
+
+        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
+
+        if not text_blocks:
+            return original_text
+
+        region_entries: List[Dict[str, Any]] = []
+        replaced_indices: Set[int] = set()
+
+        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
+            rx0, ry0, rx1, ry1 = ocr_result["bbox"]
+            current_indices: List[int] = []
+
+            for idx, block in enumerate(text_blocks):
+                if idx in replaced_indices:
+                    continue
+                if self._block_contains_heading(block["text"]):
+                    continue
+
+                bx0, by0, bx1, by1 = block["bbox"]
+                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
+                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
+                overlap_area = overlap_x * overlap_y
+                block_area = max((bx1 - bx0) * (by1 - by0), 1)
+
+                if overlap_area / block_area > 0.5:
+                    current_indices.append(idx)
+
+            if not current_indices:
+                continue
+
+            replaced_indices.update(current_indices)
+            region_entries.append({
+                "start": min(current_indices),
+                "end": max(current_indices),
+                "ocr_text": (ocr_result.get("ocr_text") or "").strip(),
+            })
+
+        if not region_entries:
+            return original_text
+
+        region_by_start = {entry["start"]: entry for entry in region_entries}
+        result_parts: List[str] = []
+        idx = 0
+
+        while idx < len(text_blocks):
+            region = region_by_start.get(idx)
+            if region is not None:
+                if region["ocr_text"]:
+                    result_parts.append(region["ocr_text"])
+                    result_parts.append("\n")
                 else:
-                    # 兜底:追加到该页面第一个小节末尾
-                    if page_sections:
-                        ps = page_sections[0]
-                        ps["all_blocks"].append(table_block)
-                        logger.warning(
-                            f"[OCR回填] 第{page_num}页表格无法精确定位,追加到 {ps['chapter']}/{ps['section']}"
-                        )
+                    for block_idx in range(region["start"], region["end"] + 1):
+                        block_text = text_blocks[block_idx]["text"]
+                        if block_text:
+                            result_parts.append(block_text)
+                            result_parts.append("\n")
+                idx = region["end"] + 1
+                continue
+
+            if idx not in replaced_indices:
+                block_text = text_blocks[idx]["text"]
+                if block_text:
+                    result_parts.append(block_text)
+                    result_parts.append("\n")
+            idx += 1
+
+        return "".join(result_parts).strip() or original_text
+
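作为补充说明,下面用一个独立的小例子演示 `_replace_table_regions` 中的重叠判定规则(交叠面积超过文本块自身面积的 50% 即视为落入表格区域)。函数名 `block_in_region` 为示意用的假设命名,并非源码中的真实接口:

```python
def block_in_region(block_bbox, region_bbox, threshold=0.5):
    """判断文本块是否落入表格区域:交叠面积 / 文本块面积 > threshold。"""
    bx0, by0, bx1, by1 = block_bbox
    rx0, ry0, rx1, ry1 = region_bbox
    # 与源码一致:先分别计算 x/y 方向的交叠长度,再取面积比
    overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
    overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
    block_area = max((bx1 - bx0) * (by1 - by0), 1)
    return overlap_x * overlap_y / block_area > threshold

# 文本块大部分被表格框覆盖 -> 参与 OCR 替换
print(block_in_region((10, 10, 110, 60), (0, 0, 120, 40)))   # True(比例 0.6)
# 仅边缘轻微相交 -> 保留原始 PDF 文本
print(block_in_region((10, 10, 110, 60), (0, 55, 120, 80)))  # False(比例 0.1)
```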
+    @classmethod
+    def _block_contains_heading(cls, text: str) -> bool:
+        """判断文本块中是否包含一级/二级标题行(标题块不参与表格替换)。"""
+        if not text or not text.strip():
+            return False
+
+        for line in cls._prepare_page_lines(text):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            if cls._matching_rule_names(stripped, "l1") or cls._matching_rule_names(stripped, "l2"):
+                return True
+        return False
+
+    def _compress_image(self, img_bytes: bytes) -> bytes:
+        """压缩图片,具体实现委托给 OcrProcessor。"""
+        if self.ocr_processor is None:
+            return img_bytes
+        return self.ocr_processor._compress_image(img_bytes)
+
+    def _extract_ocr_content(self, result: Dict) -> str:
+        """从 OCR 响应提取内容,具体实现委托给 OcrProcessor。"""
+        if self.ocr_processor is None:
+            return ""
+        return self.ocr_processor._extract_ocr_content(result)
 
     @staticmethod
     def _is_header_footer(line: str) -> bool:
+        """判断一行是否为页眉/页脚:纯页码、公司名、方案名等均被过滤。"""
+        compact_line = re.sub(r"\s+", "", line.strip())
+        if not compact_line:
+            return False
+
+        heading_prefix = re.match(
+            r"^(第[\d一二三四五六七八九十百零两]+[章节部分篇]|[\d]+\.\d+|[\d]+[\..。、]?|[一二三四五六七八九十百零两]+[、)\)\]]|第[一二三四五六七八九十百零两]+节|【\d+】)",
+            compact_line,
+        )
+
+        if compact_line.isdigit():
+            return True
+
+        if (
+            compact_line.endswith("有限责任公司")
+            or compact_line.endswith("有限公司")
+            or compact_line.endswith("股份有限公司")
+        ) and not heading_prefix:
+            return True
+
+        if compact_line.endswith("专项施工方案") and not heading_prefix:
+            return True
+
         return (
             "四川路桥建设集团股份有限公司" in line
             or "T梁运输及安装专项施工方案" in line
-            or line.isdigit()
+            or (
+                compact_line.endswith("工程项目")
+                and len(compact_line) >= 8
+                and not compact_line.startswith("第")
+            )
         )
 
+    @classmethod
+    def _matching_rule_names(
+        cls,
+        line: str,
+        level: str,
+        rule_names: Optional[List[str]] = None,
+    ) -> List[str]:
+        """返回规则库中在指定层级(l1/l2)命中该行的标题规则名列表。"""
+        clean_line = line.strip()
+        if level == "l1":
+            clean_line = cls._strip_leading_page_number_from_cn_chapter(clean_line)
+        names = rule_names or list(cls.RULE_LIB.keys())
+        return [
+            rule_name
+            for rule_name in names
+            if cls.RULE_LIB[rule_name][level].match(clean_line)
+        ]
+
+    @classmethod
+    def _matches_chapter_heading(cls, line: str, rule_names: Optional[List[str]] = None) -> bool:
+        return bool(cls._matching_rule_names(line, "l1", rule_names))
+
+    @classmethod
+    def _matches_section_heading(cls, line: str, rule_names: Optional[List[str]] = None) -> bool:
+        return bool(cls._matching_rule_names(line, "l2", rule_names))
+
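`RULE_LIB` 中 `Rule_1_纯数字派` 的两级正则(摘自本仓库规则库定义)可以单独验证标题层级的判定行为;下面的 `heading_level` 为示意用的假设函数,仅演示 l2 优先、三级编号被负向断言排除的效果:

```python
import re

# 摘自 RULE_LIB["Rule_1_纯数字派"]:l1 匹配 "1 标题"/"1. 标题",l2 匹配 "1.1 标题"
L1 = re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*")
L2 = re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)")

def heading_level(line: str):
    """返回标题层级 1/2;非标题返回 None(1.1.1 这类三级编号被 l2 的 (?!\.\d) 排除)。"""
    line = line.strip()
    if L2.match(line):
        return 2
    if L1.match(line):
        return 1
    return None

print(heading_level("1. 工程概况"))            # 1
print(heading_level("1.1 编制依据"))           # 2
print(heading_level("1.1.1 规范性引用文件"))   # None
```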
+    @staticmethod
+    def _strip_leading_page_number_from_cn_chapter(line: str) -> str:
+        """剥掉粘连在中文章节标题前的页码前缀,例如 “12 第三章 …” → “第三章 …”。"""
+        cleaned = re.sub(r"\s+", " ", line.strip())
+        if not cleaned:
+            return ""
+
+        return re.sub(
+            r"^\d{1,3}\s+(?=第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])",
+            "",
+            cleaned,
+            count=1,
+        ).strip()
+
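页码前缀剥离只在数字后确实跟着“第X章/部分/篇”时才触发,普通以数字开头的正文不受影响。下面的独立示例复用了 `_strip_leading_page_number_from_cn_chapter` 中的同一正则;`strip_page_prefix` 为示意用的假设命名:

```python
import re

# 与 _strip_leading_page_number_from_cn_chapter 相同的正则:
# 仅当数字后紧跟“第X章/部分/篇”时才把它当作粘连的页码剥掉
PAGE_PREFIX = re.compile(
    r"^\d{1,3}\s+(?=第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])"
)

def strip_page_prefix(line: str) -> str:
    cleaned = re.sub(r"\s+", " ", line.strip())
    return PAGE_PREFIX.sub("", cleaned, count=1).strip()

print(strip_page_prefix("12 第三章 施工部署"))  # "第三章 施工部署"
print(strip_page_prefix("12 混凝土浇筑"))       # 普通正文不受影响:"12 混凝土浇筑"
```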
     @staticmethod
     def _clean_chapter_title(line: str) -> str:
-        chapter_match = re.search(r"第[一二三四五六七八九十百]+章", line)
-        if not chapter_match:
-            return line.strip()
-
-        prefix = chapter_match.group(0)
-        remaining = line[chapter_match.end() :].strip()
-        remaining = re.sub(r"^[\.\s]+", "", remaining)
-        remaining = re.sub(r"\s+\d+\s*$", "", remaining)
-        remaining = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*", "", remaining)
-
-        if remaining:
-            return f"{prefix} {remaining}"
-        return prefix
+        """规范化章节标题:去除页码前缀与点线引导符,统一为“第X章 标题”或“N 标题”。"""
+        cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
+        cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
+        cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        cn_chapter_match = re.match(
+            r"^(第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])[\s、::\.-]*(.*)$",
+            cleaned,
+        )
+        if cn_chapter_match:
+            prefix = re.sub(r"\s+", "", cn_chapter_match.group(1))
+            title = cn_chapter_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        num_chapter_match = re.match(r"^(\d{1,2})(?:[\..。、])?\s*(.*)$", cleaned)
+        if num_chapter_match:
+            prefix = num_chapter_match.group(1)
+            title = num_chapter_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @staticmethod
+    def _clean_section_title(line: str) -> str:
+        """规范化小节标题:兼容 1.1、1、/一、/第X节/【1】等多种编号体系。"""
+        cleaned = line.strip()
+        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
+        cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
+        cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
+        cleaned = re.sub(r"\s+", " ", cleaned).strip()
+
+        numeric_section_match = re.match(r"^(\d+\.\d+)(?!\.\d)\.?\s*(.*)$", cleaned)
+        if numeric_section_match:
+            prefix = numeric_section_match.group(1)
+            title = numeric_section_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        numeric_list_match = re.match(r"^(\d{1,2})(?:[、\.\uFF0E\u3002\)\]\uFF09])\s*(.*)$", cleaned)
+        if numeric_list_match:
+            prefix = numeric_list_match.group(1)
+            title = numeric_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        cn_section_match = re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)[\s、::\.-]*(.*)$", cleaned)
+        if cn_section_match:
+            prefix = re.sub(r"\s+", "", cn_section_match.group(1))
+            title = cn_section_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、)\)\]])[\s]*(.*)$", cleaned)
+        if cn_list_match:
+            prefix = cn_list_match.group(1).strip()
+            title = cn_list_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        bracket_match = re.match(r"^([【\[]\s*\d+\s*[\]】])[\s]*(.*)$", cleaned)
+        if bracket_match:
+            prefix = re.sub(r"\s+", "", bracket_match.group(1))
+            title = bracket_match.group(2).strip()
+            return f"{prefix} {title}".strip()
+
+        return cleaned
+
+    @staticmethod
+    def _strip_catalog_page_suffix(text: str) -> str:
+        """去除目录条目末尾的点线引导符与页码,如 “…… 12”、“···· - 3 -”。"""
+        cleaned = re.sub(r"\s+", " ", (text or "").strip())
+        if not cleaned:
+            return ""
+
+        return re.sub(
+            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
+            "",
+            cleaned,
+        ).strip()
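目录行末尾的“点线 + 页码”形式比较多样(实心点、间隔号、连字符包裹的页码等)。下面的独立示例复用了 `_strip_catalog_page_suffix` 中的同一正则来演示两种典型写法;`strip_catalog_suffix` 为示意用的假设命名:

```python
import re

# 与 _strip_catalog_page_suffix 相同的正则:去掉目录行末尾的点线引导符和页码
SUFFIX = re.compile(
    r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$"
)

def strip_catalog_suffix(text: str) -> str:
    cleaned = re.sub(r"\s+", " ", (text or "").strip())
    return SUFFIX.sub("", cleaned).strip()

print(strip_catalog_suffix("2.1 编制依据 .......... 12"))   # "2.1 编制依据"
print(strip_catalog_suffix("第一章 工程概况 ···· - 3 -"))   # "第一章 工程概况"
```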

+ 0 - 1865
core/construction_review/component/minimal_pipeline/pdf_extractor2.py

@@ -1,1865 +0,0 @@
-"""
-PDF 结构提取器 - 同步并发 OCR 版本
-
-基于 splitter_pdf 逻辑,直接提取章节结构并记录页码。
-支持 OCR 增强:检测表格区域并使用 ThreadPoolExecutor 5并发 OCR,其他文本保持 PyMuPDF 提取。
-输出格式兼容后续分类与组装流程。
-"""
-
-import base64
-import io
-import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import dataclass
-from typing import Dict, Any, List, Optional, Tuple, Set
-
-import fitz
-import numpy as np
-import requests
-
-from foundation.observability.logger.loggering import review_logger as logger
-
-# 尝试导入 RapidLayout
-try:
-    from rapid_layout import RapidLayout
-    RAPID_LAYOUT_AVAILABLE = True
-except ImportError:
-    RAPID_LAYOUT_AVAILABLE = False
-    RapidLayout = None
-
-
-@dataclass
-class TableRegion:
-    """表格区域信息"""
-    page_num: int
-    page: fitz.Page
-    bbox: Tuple[float, float, float, float]
-    score: float
-
-
-@dataclass
-class OcrResult:
-    """OCR 结果"""
-    page_num: int
-    bbox: Tuple[float, float, float, float]
-    score: float
-    text: str
-    success: bool
-
-
-class PdfStructureExtractor:
-    """PDF 章节结构提取器(支持 OCR 异步并发)"""
-
-    RULE_LIB = {
-        "Rule_1_纯数字派": {
-            "l1": re.compile(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*"),
-            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_2_混合章派": {
-            "l1": re.compile(r"^第\s*(\d+)\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_3_中英混血派": {
-            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^(\d+)\.(\d+)(?!\.\d)\.?\s*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_4_传统公文派": {
-            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[、\s]+([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_5_单边括号派": {
-            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^([一二三四五六七八九十百零两]+)[)\)\]][\s]*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_6_小节派": {
-            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^第\s*([一二三四五六七八九十百零两]+)\s*节[\s、]*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_7_粗体括号派": {
-            "l1": re.compile(r"^第\s*[一二三四五六七八九十百零两]+\s*[章部分篇][\s、]*(.*)"),
-            "l2": re.compile(r"^[【\[]\s*(\d+)\s*[\]】][\s]*([\u4e00-\u9fa5A-Za-z].*)"),
-        },
-        "Rule_8_cn_list_l1_numeric_l2": {
-            "l1": re.compile(
-                r"^(?:[一二三四五六七八九十百零两]+)[、\)\]\uFF09]\s*[\u4e00-\u9fa5A-Za-z].*"
-            ),
-            "l2": re.compile(
-                r"^\d{1,2}(?:[、\.\uFF0E\u3002\)\]\uFF09])\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*"
-            ),
-        },
-    }
-    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
-
-    # OCR 配置
-    MAX_SHORT_EDGE = 1024
-    JPEG_QUALITY = 90
-    OCR_DPI = 200
-    OCR_CONFIDENCE_THRESHOLD = 0.5
-    OCR_CONCURRENT_WORKERS = 5
-
-    def __init__(
-        self,
-        clip_top: float = 60,
-        clip_bottom: float = 60,
-        use_ocr: bool = False,
-        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
-        ocr_timeout: int = 600,
-        ocr_api_key: str = "",
-        detect_toc: bool = True,
-        toc_model_path: str = "config/yolo/best.pt",
-    ):
-        self.clip_top = clip_top
-        self.clip_bottom = clip_bottom
-        self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
-
-        # OCR 配置
-        self.ocr_api_url = ocr_api_url
-        self.ocr_timeout = ocr_timeout
-        self.ocr_api_key = ocr_api_key
-        self._layout_engine: Optional[Any] = None
-
-        # 目录检测配置
-        self.detect_toc = detect_toc
-        self.toc_model_path = toc_model_path
-        self._toc_extractor = None
-
-        if use_ocr and not RAPID_LAYOUT_AVAILABLE:
-            logger.warning("RapidLayout 未安装,OCR 功能不可用")
-
-    def _get_layout_engine(self) -> Optional[Any]:
-        """延迟初始化 RapidLayout"""
-        if self._layout_engine is None and RAPID_LAYOUT_AVAILABLE:
-            self._layout_engine = RapidLayout()
-        return self._layout_engine
-
-    def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
-        """
-        从 PDF 字节流提取章节结构。
-
-        Args:
-            file_content: PDF 文件字节流
-            progress_callback: 进度回调函数,接收 (stage, current, message) 参数
-
-        Returns:
-            {
-                "chapters": {
-                    "第一章 xxx": {
-                        "章节标题": {"content": "...", "page_start": 1, "page_end": 1},
-                        "一、xxx": {"content": "...", "page_start": 2, "page_end": 3},
-                    }
-                },
-                "total_pages": N,
-                "catalog": {  # 目录结构(YOLO检测+OCR提取)
-                    "chapters": [...],
-                    "total_chapters": N
-                }
-            }
-        """
-        result = {"chapters": {}, "total_pages": 0, "catalog": None}
-
-        # === 阶段0: 目录页检测与提取(如果启用)===
-        if self.detect_toc:
-            try:
-                catalog = self._extract_catalog(file_content, progress_callback)
-                if catalog:
-                    catalog = self._normalize_catalog(catalog)
-                    result["catalog"] = catalog
-                    logger.info(f"[PDF提取] 目录提取完成: {catalog.get('total_chapters', 0)} 章")
-            except Exception as e:
-                logger.warning(f"[PDF提取] 目录提取失败: {e}")
-
-        # === 阶段1-3: 文档结构提取 ===
-        doc = fitz.open(stream=file_content)
-        try:
-            structure = self._extract_from_doc(doc, progress_callback)
-            if result.get("catalog"):
-                # 正文抽取和目录检测是两条独立链路:
-                # 1. 正文抽取更容易拿到连续 content
-                # 2. 目录检测更容易保留顺序和层级
-                # 这里先用目录骨架对齐正文,再按标题边界重建内容,尽量减少漏标题造成的结构缺失。
-                structure["chapters"] = self._reconcile_structure_with_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                )
-                rebuilt_chapters = self._rebuild_section_contents_from_catalog(
-                    structure.get("chapters", {}),
-                    result["catalog"],
-                    structure.get("_body_lines", []),
-                )
-                if rebuilt_chapters:
-                    structure["chapters"] = rebuilt_chapters
-                enriched_catalog = self._enrich_catalog_with_structure(
-                    result["catalog"],
-                    structure.get("chapters", {}),
-                )
-                if enriched_catalog:
-                    result["catalog"] = enriched_catalog
-            structure.pop("_body_lines", None)
-            result["chapters"] = structure.get("chapters", {})
-            result["total_pages"] = len(doc)
-            return result
-        finally:
-            doc.close()
-
-    def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
-        """
-        提取目录结构(YOLO检测 + OCR识别)
-
-        Returns:
-            {"chapters": [...], "total_chapters": N} 或 None
-        """
-        # 延迟导入避免循环依赖(YOLO依赖必须存在,否则报错)
-        from .toc_detector import TOCCatalogExtractor
-
-        if self._toc_extractor is None:
-            self._toc_extractor = TOCCatalogExtractor(
-                model_path=self.toc_model_path,
-                ocr_api_url=self.ocr_api_url,
-                ocr_api_key=self.ocr_api_key,
-                ocr_timeout=self.ocr_timeout,
-            )
-
-        return self._toc_extractor.detect_and_extract(file_content, progress_callback)
-
-    def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
-        """提取文档结构(支持 OCR 异步并发)。
-
-        整体分三步:
-        1. 先扫描页面,找出需要 OCR 替换的表格区域
-        2. 并发执行 OCR,并把识别结果按页回填
-        3. 重新遍历页面文本,按标题规则切出 chapter / section 结构
-        """
-
-        def _emit_progress(stage: str, current: int, message: str):
-            """发送进度回调"""
-            if progress_callback:
-                try:
-                    progress_callback(stage, current, message)
-                except Exception:
-                    pass
-
-        # === 阶段1: 收集所有需要 OCR 的表格区域 ===
-        table_regions: List[TableRegion] = []
-
-        if self.use_ocr:
-            logger.info("[OCR预处理] 扫描所有页面的表格区域...")
-            total_pages = len(doc)
-            for page_num in range(total_pages):
-                page = doc.load_page(page_num)
-                rect = page.rect
-                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-                regions = self._detect_table_regions(page, page_num + 1, clip_box)
-                for bbox, score in regions:
-                    table_regions.append(TableRegion(
-                        page_num=page_num + 1,
-                        page=page,
-                        bbox=bbox,
-                        score=score
-                    ))
-                # 每5页或最后一页推送一次进度
-                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
-                    progress = int((page_num + 1) / total_pages * 30)  # OCR预处理占30%进度
-                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
-            logger.info(f"[OCR预处理] 共发现 {len(table_regions)} 个表格区域需要 OCR")
-
-        # === 阶段2: 异步并发执行 OCR (5并发) ===
-        ocr_results: List[OcrResult] = []
-
-        if table_regions:
-            logger.info(f"[OCR执行] 使用 {self.OCR_CONCURRENT_WORKERS} 并发执行 OCR...")
-            _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
-            ocr_results = self._process_ocr_concurrent(table_regions, progress_callback=_emit_progress)
-            success_count = sum(1 for r in ocr_results if r.success)
-            logger.info(f"[OCR执行] 完成 {success_count}/{len(table_regions)} 个表格 OCR")
-            _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
-
-        # 按页码分组 OCR 结果
-        ocr_by_page: Dict[int, List[OcrResult]] = {}
-        for result in ocr_results:
-            if result.success:
-                if result.page_num not in ocr_by_page:
-                    ocr_by_page[result.page_num] = []
-                ocr_by_page[result.page_num].append(result)
-
-        # === 阶段3: 提取页面文本(应用 OCR 结果)并切分章节 ===
-        structured_data: Dict[str, Dict[str, Dict[str, Any]]] = {}
-        # body_lines 保留过滤页眉页脚后的线性正文,后续目录回填时会再次按标题边界切段。
-        body_lines: List[Dict[str, Any]] = []
-        current_chapter = "未分类前言"
-        current_section = "默认部分"
-        in_body = False
-        candidate_rule_names: Optional[List[str]] = None
-        active_rule_name: Optional[str] = None
-
-        logger.info("[文本提取] 提取页面内容并切分章节...")
-
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            rect = page.rect
-            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
-
-            # 获取页面文本(应用 OCR 结果)
-            if page_num + 1 in ocr_by_page:
-                original_text = page.get_text("text", clip=clip_box)
-                ocr_results_list = [
-                    {
-                        "region_index": i,
-                        "bbox": r.bbox,
-                        "score": r.score,
-                        "ocr_text": r.text,
-                    }
-                    for i, r in enumerate(ocr_by_page[page_num + 1])
-                ]
-                text = self._replace_table_regions(page, original_text, ocr_results_list, clip_box)
-            else:
-                text = page.get_text("text", clip=clip_box)
-
-            lines = self._prepare_page_lines(text)
-            for line in lines:
-                if not line or self._is_header_footer(line):
-                    continue
-                body_lines.append({
-                    "page": page_num + 1,
-                    "text": line,
-                })
-
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-                if self._is_header_footer(line):
-                    continue
-
-                # 跳过目录阶段
-                if not in_body:
-                    # 只有首次遇到真正的一级标题后,才认为进入正文。
-                    # 这样可以避免目录页虽然命中标题规则,却被误当成正文结构。
-                    matched_rules = self._matching_rule_names(line, "l1")
-                    if matched_rules and not self.TOC_PATTERN.search(line):
-                        in_body = True
-                        candidate_rule_names = matched_rules
-                    else:
-                        continue
-
-                # 跳过残余目录格式
-                if self.TOC_PATTERN.search(line):
-                    continue
-
-                # candidate_rule_names 表示“这篇文档可能使用的标题体系”;
-                # active_rule_name 表示“已经确认正在使用的二级标题规则”。
-                # 先宽松候选、后收敛到单一规则,可以减少混合编号文档里的串匹配。
-                active_scope = [active_rule_name] if active_rule_name else candidate_rule_names
-
-                # 匹配章标题
-                matched_chapter_rules = self._matching_rule_names(line, "l1", active_scope)
-                if matched_chapter_rules:
-                    if active_rule_name is None:
-                        candidate_rule_names = matched_chapter_rules
-                    current_chapter = self._clean_chapter_title(line)
-                    current_section = "章节标题"
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = {
-                            "lines": [],
-                            "page_start": page_num + 1,
-                            "page_end": page_num + 1,
-                        }
-                    continue
-
-                # 匹配节标题
-                matched_section_rules = self._matching_rule_names(line, "l2", active_scope)
-                if matched_section_rules:
-                    if active_rule_name is None:
-                        if candidate_rule_names:
-                            for rule_name in candidate_rule_names:
-                                if rule_name in matched_section_rules:
-                                    active_rule_name = rule_name
-                                    break
-                        if active_rule_name is None:
-                            active_rule_name = matched_section_rules[0]
-                    current_section = self._clean_section_title(line)
-                    if current_chapter not in structured_data:
-                        structured_data[current_chapter] = {}
-                    if current_section not in structured_data[current_chapter]:
-                        structured_data[current_chapter][current_section] = {
-                            "lines": [],
-                            "page_start": page_num + 1,
-                            "page_end": page_num + 1,
-                        }
-                    continue
-
-                # 确保结构存在
-                if current_chapter not in structured_data:
-                    structured_data[current_chapter] = {}
-                if current_section not in structured_data[current_chapter]:
-                    structured_data[current_chapter][current_section] = {
-                        "lines": [],
-                        "page_start": page_num + 1,
-                        "page_end": page_num + 1,
-                    }
-
-                # 添加内容
-                structured_data[current_chapter][current_section]["lines"].append(line)
-                structured_data[current_chapter][current_section]["page_end"] = page_num + 1
-
-        # 将行列表拼接为文本
-        result: Dict[str, Any] = {"chapters": {}, "_body_lines": body_lines}
-        for chap, sections in structured_data.items():
-            result["chapters"][chap] = {}
-            for sec, data in sections.items():
-                result["chapters"][chap][sec] = {
-                    "content": "\n".join(data["lines"]),
-                    "page_start": data["page_start"],
-                    "page_end": data["page_end"],
-                }
-
-        logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
-        return result
-
-    def _normalize_catalog(self, catalog: Dict[str, Any]) -> Dict[str, Any]:
-        """统一目录来源并择优合并。
-
-        目录检测器输出的 chapters 更像“骨架”,raw_ocr_text 更接近页面原文。
-        这里会分别解析两份结果,判断谁更可信,再做一次合并补齐。
-        """
-        if not catalog:
-            return {}
-
-        normalized = dict(catalog)
-        existing_chapters = self._sanitize_catalog_chapters(catalog.get("chapters", []))
-        raw_text = catalog.get("raw_ocr_text", "")
-        parsed_chapters = self._parse_catalog_from_raw_text(raw_text) if isinstance(raw_text, str) else []
-        selected_chapters = existing_chapters
-
-        if parsed_chapters:
-            if self._should_prefer_parsed_catalog(parsed_chapters, existing_chapters):
-                selected_chapters = parsed_chapters
-            elif existing_chapters:
-                logger.info(
-                    "[PDF提取] raw_ocr_text目录解析结果异常,保留原始目录骨架: "
-                    f"parsed={len(parsed_chapters)}, original={len(existing_chapters)}"
-                )
-            else:
-                selected_chapters = parsed_chapters
-
-        if selected_chapters:
-            selected_chapters = self._merge_catalog_chapters(
-                selected_chapters,
-                parsed_chapters,
-            )
-            normalized["chapters"] = selected_chapters
-            normalized["total_chapters"] = len(selected_chapters)
-            normalized["formatted_text"] = self._format_catalog_chapters(selected_chapters)
-        return normalized
-
-    def _parse_catalog_from_raw_text(self, text: str) -> List[Dict[str, Any]]:
-        """把目录页 OCR 原文解析成章节树。
-
-        解析时会先根据首批命中的一级标题推断文档的目录样式,
-        后续再尽量沿用同一套规则收敛二级标题,避免不同编号体系互相污染。
-        """
-        if not text or not text.strip():
-            return []
-
-        chapters: List[Dict[str, Any]] = []
-        current_chapter: Optional[Dict[str, Any]] = None
-        active_l2_rule: Optional[str] = None
-        document_l1_rules: Optional[List[str]] = None
-
-        for raw_line in self._prepare_catalog_raw_lines(text):
-            title_text, page = self._split_catalog_entry(raw_line)
-            if not title_text:
-                continue
-
-            compact = re.sub(r"\s+", "", title_text)
-            if compact in {"目录", "目錄"}:
-                continue
-
-            chapter_matches = self._matching_rule_names(title_text, "l1", document_l1_rules)
-            if chapter_matches:
-                if document_l1_rules is None:
-                    document_l1_rules = chapter_matches
-                current_chapter = {
-                    "index": len(chapters) + 1,
-                    "title": self._clean_chapter_title(title_text),
-                    "page": str(page or 1),
-                    "original": raw_line.strip(),
-                    "subsections": [],
-                }
-                chapters.append(current_chapter)
-                active_l2_rule = None
-                continue
-
-            if current_chapter is None:
-                continue
-
-            section_matches = self._matching_rule_names(title_text, "l2")
-            if not section_matches:
-                numeric_section_title = self._coerce_numeric_catalog_section(
-                    title_text,
-                    document_l1_rules,
-                    active_l2_rule,
-                )
-                if numeric_section_title:
-                    section_key = self._normalize_heading_key(numeric_section_title)
-                    existing_keys = {
-                        self._normalize_heading_key(sub.get("title", ""))
-                        for sub in current_chapter.get("subsections", [])
-                    }
-                    if section_key not in existing_keys:
-                        current_chapter["subsections"].append({
-                            "title": numeric_section_title,
-                            "page": str(page or current_chapter.get("page", 1)),
-                            "level": 2,
-                            "original": raw_line.strip(),
-                        })
-                continue
-
-            if active_l2_rule is None:
-                active_l2_rule = section_matches[0]
-            if active_l2_rule not in section_matches:
-                continue
-
-            section_title = self._clean_section_title(title_text)
-            section_key = self._normalize_heading_key(section_title)
-            existing_keys = {
-                self._normalize_heading_key(sub.get("title", ""))
-                for sub in current_chapter.get("subsections", [])
-            }
-            if section_key in existing_keys:
-                continue
-
-            current_chapter["subsections"].append({
-                "title": section_title,
-                "page": str(page or current_chapter.get("page", 1)),
-                "level": 2,
-                "original": raw_line.strip(),
-            })
-
-        return chapters
-
-    @classmethod
-    def _sanitize_catalog_chapters(cls, chapters: Any) -> List[Dict[str, Any]]:
-        if not isinstance(chapters, list):
-            return []
-
-        sanitized: List[Dict[str, Any]] = []
-        seen_chapter_keys: Set[str] = set()
-
-        for idx, chapter in enumerate(chapters, 1):
-            if not isinstance(chapter, dict):
-                continue
-
-            chapter_title = cls._clean_chapter_title(str(chapter.get("title", "") or ""))
-            chapter_key = cls._normalize_heading_key(chapter_title)
-            if not chapter_key or chapter_key in seen_chapter_keys:
-                continue
-
-            seen_chapter_keys.add(chapter_key)
-            chapter_page = str(chapter.get("page") or idx)
-            subsections: List[Dict[str, Any]] = []
-            seen_section_keys: Set[str] = set()
-
-            for subsection in chapter.get("subsections", []) or []:
-                if not isinstance(subsection, dict):
-                    continue
-
-                section_title = cls._clean_section_title(str(subsection.get("title", "") or ""))
-                section_key = cls._normalize_heading_key(section_title)
-                if not section_key or section_key in seen_section_keys:
-                    continue
-
-                seen_section_keys.add(section_key)
-                subsections.append({
-                    "title": section_title,
-                    "page": str(subsection.get("page") or chapter_page),
-                    "level": 2,
-                    "original": subsection.get("original", "") or section_title,
-                })
-
-            sanitized.append({
-                "index": len(sanitized) + 1,
-                "title": chapter_title,
-                "page": chapter_page,
-                "original": chapter.get("original", "") or chapter_title,
-                "subsections": subsections,
-            })
-
-        return sanitized
-
-    @classmethod
-    def _prepare_catalog_raw_lines(cls, text: str) -> List[str]:
-        raw_lines = [line.strip() for line in text.splitlines() if line.strip()]
-        prepared: List[str] = []
-        index = 0
-
-        while index < len(raw_lines):
-            current = raw_lines[index].strip()
-            compact_current = re.sub(r"\s+", "", current)
-
-            if compact_current in {"目", "錄", "录"} and index + 1 < len(raw_lines):
-                next_compact = re.sub(r"\s+", "", raw_lines[index + 1].strip())
-                if compact_current + next_compact in {"目录", "目錄"}:
-                    prepared.append(compact_current + next_compact)
-                    index += 2
-                    continue
-
-            if cls._is_incomplete_heading_fragment(current) and index + 1 < len(raw_lines):
-                next_line = raw_lines[index + 1].strip()
-                candidate = f"{current} {next_line}".strip()
-                _, candidate_page = cls._split_catalog_entry(candidate)
-                if (
-                    cls._matching_rule_names(candidate, "l1")
-                    or cls._matching_rule_names(candidate, "l2")
-                    or candidate_page is not None
-                ):
-                    prepared.append(candidate)
-                    index += 2
-                    continue
-
-            prepared.append(current)
-            index += 1
-
-        return prepared
-
-    @classmethod
-    def _should_prefer_parsed_catalog(
-        cls,
-        parsed_chapters: List[Dict[str, Any]],
-        existing_chapters: List[Dict[str, Any]],
-    ) -> bool:
-        if not parsed_chapters:
-            return False
-
-        parsed_is_suspicious = cls._catalog_has_suspicious_structure(parsed_chapters)
-        existing_is_suspicious = cls._catalog_has_suspicious_structure(existing_chapters)
-
-        if parsed_is_suspicious:
-            if not existing_chapters or not existing_is_suspicious:
-                return False
-
-            parsed_score = cls._catalog_structure_score(parsed_chapters)
-            existing_score = cls._catalog_structure_score(existing_chapters)
-            overlap_ratio = cls._catalog_chapter_overlap_ratio(parsed_chapters, existing_chapters)
-            return overlap_ratio >= 0.6 and parsed_score > existing_score
-
-        if not existing_chapters:
-            return True
-
-        if existing_is_suspicious:
-            return True
-
-        if cls._should_prefer_single_level_parsed_catalog(parsed_chapters, existing_chapters):
-            return True
-
-        parsed_score = cls._catalog_structure_score(parsed_chapters)
-        existing_score = cls._catalog_structure_score(existing_chapters)
-        if parsed_score <= existing_score:
-            return False
-
-        if not cls._catalog_has_suspicious_structure(existing_chapters):
-            existing_count = len(existing_chapters)
-            parsed_count = len(parsed_chapters)
-            if parsed_count > max(existing_count * 2, existing_count + 8):
-                return False
-            if existing_count >= 4 and parsed_count < max(2, existing_count // 2):
-                return False
-
-        return True
-
-    @classmethod
-    def _should_prefer_single_level_parsed_catalog(
-        cls,
-        parsed_chapters: List[Dict[str, Any]],
-        existing_chapters: List[Dict[str, Any]],
-    ) -> bool:
-        """Special-case: a flat single-level TOC that was misparsed as one chapter with many subsections."""
-        if len(parsed_chapters) < 2 or len(existing_chapters) != 1:
-            return False
-
-        if any(chapter.get("subsections") for chapter in parsed_chapters):
-            return False
-
-        existing_subsections = existing_chapters[0].get("subsections", []) or []
-        if len(existing_subsections) < len(parsed_chapters) - 1:
-            return False
-
-        parsed_pages = [
-            cls._safe_page_number(chapter.get("page"), 1)
-            for chapter in parsed_chapters
-        ]
-        return parsed_pages == sorted(parsed_pages)
-
-    @classmethod
-    def _catalog_has_suspicious_structure(cls, chapters: List[Dict[str, Any]]) -> bool:
-        if not chapters:
-            return False
-
-        titles = [(chapter.get("title", "") or "").strip() for chapter in chapters]
-        chinese_chapter_count = sum(
-            1 for title in titles
-            if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", title)
-        )
-        numeric_heading_count = sum(
-            1 for title in titles
-            if re.match(r"^\d{1,2}(?:[\..。、])?\s+\S+", title)
-        )
-        embedded_numeric_body_count = 0
-        repeated_chapter_no_count = 0
-        reversed_chapter_no_count = 0
-        seen_chapter_numbers: Set[str] = set()
-        previous_numeric_chapter_no: Optional[int] = None
-
-        for title in titles:
-            chapter_match = re.match(
-                r"^第\s*(\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
-                title,
-            )
-            if not chapter_match:
-                continue
-
-            chapter_no = re.sub(r"\s+", "", chapter_match.group(1))
-            chapter_body = (chapter_match.group(2) or "").strip()
-            if chapter_no in seen_chapter_numbers:
-                repeated_chapter_no_count += 1
-            seen_chapter_numbers.add(chapter_no)
-
-            if chapter_no.isdigit():
-                current_numeric_no = int(chapter_no)
-                if previous_numeric_chapter_no is not None and current_numeric_no < previous_numeric_chapter_no:
-                    reversed_chapter_no_count += 1
-                previous_numeric_chapter_no = current_numeric_no
-
-            if re.match(r"^\d{1,2}(?:\.\d{1,2})*\.?(?:\s+|$)", chapter_body):
-                embedded_numeric_body_count += 1
-
-        if chinese_chapter_count >= 2 and numeric_heading_count >= max(3, chinese_chapter_count // 2):
-            return True
-
-        if chinese_chapter_count >= max(2, len(titles) // 3) and numeric_heading_count >= max(2, len(titles) // 6):
-            return True
-
-        if embedded_numeric_body_count >= max(2, len(titles) // 5):
-            return True
-
-        if repeated_chapter_no_count > 0 or reversed_chapter_no_count > 0:
-            return True
-
-        return False
-
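The first branch of the heuristic above can be exercised in isolation. This is a minimal standalone sketch (hypothetical helper name, simplified regexes and thresholds, not the class method itself): a chapter list that mixes Chinese-style "第X章" headings with enough bare numeric headings at the same level most likely merged two TOC numbering styles.

```python
import re

def mixed_numbering_is_suspicious(titles):
    # Suspicious when Chinese-style chapter headings coexist with enough
    # bare numeric headings at the same level (first branch above only).
    chinese = sum(1 for t in titles
                  if re.match(r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]", t))
    numeric = sum(1 for t in titles if re.match(r"^\d{1,2}[.。、]?\s+\S+", t))
    return chinese >= 2 and numeric >= max(3, chinese // 2)

print(mixed_numbering_is_suspicious(
    ["第1章 总则", "第2章 编制依据", "3 施工准备", "4 主要施工方法", "5 质量保证"]))  # True
print(mixed_numbering_is_suspicious(
    ["第1章 总则", "第2章 编制依据", "第3章 施工准备"]))  # False
```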
-    @staticmethod
-    def _catalog_structure_score(chapters: List[Dict[str, Any]]) -> int:
-        score = 0
-        for chapter in chapters:
-            score += 1
-            score += len(chapter.get("subsections", []) or [])
-        return score
-
-    @classmethod
-    def _catalog_chapter_overlap_ratio(
-        cls,
-        chapters_a: List[Dict[str, Any]],
-        chapters_b: List[Dict[str, Any]],
-    ) -> float:
-        if not chapters_a or not chapters_b:
-            return 0.0
-
-        keys_a = {
-            cls._catalog_chapter_identity_key(chapter.get("title", ""))
-            for chapter in chapters_a
-            if chapter.get("title")
-        }
-        keys_b = {
-            cls._catalog_chapter_identity_key(chapter.get("title", ""))
-            for chapter in chapters_b
-            if chapter.get("title")
-        }
-        if not keys_a or not keys_b:
-            return 0.0
-
-        return len(keys_a & keys_b) / max(1, min(len(keys_a), len(keys_b)))
-
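The core of `_catalog_chapter_overlap_ratio` is the set computation below; this sketch assumes identity keys have already been extracted (the title-normalisation step is elided):

```python
def overlap_ratio(keys_a, keys_b):
    # Overlap is measured against the SMALLER set, so a short parsed catalog
    # fully contained in a longer existing one still scores 1.0.
    if not keys_a or not keys_b:
        return 0.0
    return len(keys_a & keys_b) / max(1, min(len(keys_a), len(keys_b)))

print(overlap_ratio({"总则", "编制依据", "施工准备"}, {"总则", "施工准备"}))  # 1.0
```

Dividing by the smaller set (rather than the union) is what lets a partially parsed catalog still clear the 0.6 threshold used in `_should_prefer_parsed_catalog`.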
-    @classmethod
-    def _catalog_chapter_identity_key(cls, title: str) -> str:
-        cleaned = cls._clean_chapter_title(title)
-        if not cleaned:
-            return ""
-
-        chapter_match = re.match(
-            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章节部分篇]\s*(.*)$",
-            cleaned,
-        )
-        if chapter_match:
-            chapter_body = cls._normalize_heading_key(chapter_match.group(1))
-            if chapter_body:
-                return chapter_body
-
-        numeric_match = re.match(r"^\d{1,2}(?:[\..。、])?\s*(.*)$", cleaned)
-        if numeric_match:
-            numeric_body = cls._normalize_heading_key(numeric_match.group(1))
-            if numeric_body:
-                return numeric_body
-
-        return cls._normalize_heading_key(cleaned)
-
-    @classmethod
-    def _merge_catalog_chapters(
-        cls,
-        base_chapters: List[Dict[str, Any]],
-        supplemental_chapters: List[Dict[str, Any]],
-    ) -> List[Dict[str, Any]]:
-        if not base_chapters:
-            return supplemental_chapters or []
-        if not supplemental_chapters:
-            return base_chapters
-
-        merged: List[Dict[str, Any]] = []
-        supplemental_by_key = {
-            cls._catalog_chapter_identity_key(chapter.get("title", "")): chapter
-            for chapter in supplemental_chapters
-            if chapter.get("title")
-        }
-
-        for index, chapter in enumerate(base_chapters, 1):
-            chapter_copy = {
-                **chapter,
-                "subsections": [dict(sub) for sub in chapter.get("subsections", []) or []],
-            }
-            chapter_key = cls._catalog_chapter_identity_key(chapter_copy.get("title", ""))
-            supplemental = supplemental_by_key.get(chapter_key)
-            if supplemental:
-                merged_subsections = cls._merge_catalog_subsections(
-                    chapter_copy.get("subsections", []),
-                    supplemental.get("subsections", []) or [],
-                )
-                chapter_copy["subsections"] = merged_subsections
-            chapter_copy["index"] = index
-            merged.append(chapter_copy)
-
-        return merged
-
-    @classmethod
-    def _merge_catalog_subsections(
-        cls,
-        base_subsections: List[Dict[str, Any]],
-        supplemental_subsections: List[Dict[str, Any]],
-    ) -> List[Dict[str, Any]]:
-        if not base_subsections:
-            return [dict(sub) for sub in supplemental_subsections]
-        if not supplemental_subsections:
-            return [dict(sub) for sub in base_subsections]
-
-        def _subsection_score(items: List[Dict[str, Any]]) -> int:
-            score = 0
-            for item in items:
-                title = (item.get("title", "") or "").strip()
-                if not title:
-                    continue
-                score += 1
-                if re.match(r"^\d+\.\d+(?!\.\d)\.?\s*", title):
-                    score += 3
-                elif re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)", title):
-                    score += 3
-                elif re.match(r"^([一二三四五六七八九十百零两]+[、)\)\]])", title):
-                    score += 3
-                elif re.match(r"^[【\[]\s*\d+\s*[\]】]", title):
-                    score += 3
-                elif re.match(r"^\d{1,2}[\..。、]\s*", title):
-                    score += 1
-            return score
-
-        base_score = _subsection_score(base_subsections)
-        supplemental_score = _subsection_score(supplemental_subsections)
-        if supplemental_score > base_score:
-            return [dict(sub) for sub in supplemental_subsections]
-
-        merged = [dict(sub) for sub in base_subsections]
-        seen_keys = {
-            cls._normalize_heading_key(sub.get("title", ""))
-            for sub in merged
-            if sub.get("title")
-        }
-        for subsection in supplemental_subsections:
-            subsection_key = cls._normalize_heading_key(subsection.get("title", ""))
-            if not subsection_key or subsection_key in seen_keys:
-                continue
-            merged.append(dict(subsection))
-            seen_keys.add(subsection_key)
-        return merged
-
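The `_subsection_score` closure above decides which subsection list wins a merge. A standalone sketch of the same scoring rule (module-level function for illustration; the original is a local closure): well-formed second-level numbering ("1.1 ...", "第X节", "一、...", "【1】 ...") earns +3 on top of the base +1 per title.

```python
import re

def subsection_score(titles):
    # +1 per non-empty title; +3 more for structured second-level numbering,
    # or +1 more for a plain "N." chapter-style prefix.
    score = 0
    for title in titles:
        title = title.strip()
        if not title:
            continue
        score += 1
        if (re.match(r"^\d+\.\d+(?!\.\d)\.?\s*", title)
                or re.match(r"^第\s*[一二三四五六七八九十百零两]+\s*节", title)
                or re.match(r"^[一二三四五六七八九十百零两]+[、))\]]", title)
                or re.match(r"^[【\[]\s*\d+\s*[\]】]", title)):
            score += 3
        elif re.match(r"^\d{1,2}[.。、]\s*", title):
            score += 1
    return score

print(subsection_score(["1.1 工程概况", "1.2 编制依据"]))  # 8
print(subsection_score(["工程概况"]))                      # 1
```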
-    @classmethod
-    def _coerce_numeric_catalog_section(
-        cls,
-        title_text: str,
-        document_l1_rules: Optional[List[str]],
-        active_l2_rule: Optional[str],
-    ) -> Optional[str]:
-        if active_l2_rule is not None:
-            return None
-
-        if not document_l1_rules:
-            return None
-
-        if "Rule_1_纯数字派" in document_l1_rules:
-            return None
-
-        if re.match(r"^\d{1,2}(?:[\..。、])?\s*(?!\d)[\u4e00-\u9fa5A-Za-z].*", title_text.strip()):
-            return cls._clean_section_title(title_text)
-
-        return None
-
-    @staticmethod
-    def _split_catalog_entry(line: str) -> Tuple[str, Optional[int]]:
-        cleaned = line.strip()
-        if not cleaned:
-            return "", None
-
-        cleaned = re.sub(r"\s+", " ", cleaned).strip()
-        page_match = re.search(
-            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*(\d+)\s*[-\u2013\u2014 ]*$",
-            cleaned,
-        )
-        if page_match:
-            title_text = cleaned[:page_match.start()].strip()
-            title_text = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", title_text).strip()
-            return title_text, int(page_match.group(1))
-
-        return cleaned, None
-
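`_split_catalog_entry` peels a trailing page number off a dot-leader TOC line. A simplified standalone sketch of the same split (the dash variants in the leader class are reduced here):

```python
import re

def split_catalog_entry(line):
    # "title .......... 12" -> ("title", 12); no page suffix -> (line, None)
    cleaned = re.sub(r"\s+", " ", line.strip())
    m = re.search(r"(?:[.\u2026\u00b7\u2022 ]{2,})[- ]*(\d+)\s*[- ]*$", cleaned)
    if m:
        title = re.sub(r"[.\u2026\u00b7\u2022 ]+$", "", cleaned[:m.start()]).strip()
        return title, int(m.group(1))
    return cleaned, None

print(split_catalog_entry("1.1 工程概况 .......... 12"))  # ('1.1 工程概况', 12)
print(split_catalog_entry("第1章 总则"))                  # ('第1章 总则', None)
```

Requiring at least two leader characters (`{2,}`) is what keeps a bare trailing digit inside a title (e.g. a year) from being mistaken for a page number.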
-    @staticmethod
-    def _format_catalog_chapters(chapters: List[Dict[str, Any]]) -> str:
-        lines: List[str] = []
-        for chapter in chapters:
-            title = chapter.get("title", "").strip()
-            if not title:
-                continue
-            lines.append(title)
-            for sub in chapter.get("subsections", []):
-                sub_title = sub.get("title", "").strip()
-                if sub_title:
-                    lines.append(f"  {sub_title}")
-        return "\n".join(lines)
-
-    def _enrich_catalog_with_structure(
-        self,
-        catalog: Dict[str, Any],
-        chapters: Dict[str, Dict[str, Dict[str, Any]]],
-    ) -> Dict[str, Any]:
-        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
-        if not catalog_chapters or not chapters:
-            return catalog
-
-        enriched = dict(catalog)
-        structure_items = list(chapters.items())
-        structure_by_key = {
-            self._catalog_chapter_identity_key(chapter_title): (chapter_title, sections)
-            for chapter_title, sections in structure_items
-        }
-        used_structure_keys: Set[str] = set()
-
-        enriched_chapters: List[Dict[str, Any]] = []
-        for catalog_chapter in catalog_chapters:
-            chapter_copy = dict(catalog_chapter)
-            chapter_key = self._catalog_chapter_identity_key(chapter_copy.get("title", ""))
-            structure_match = structure_by_key.get(chapter_key)
-            if structure_match is None:
-                enriched_chapters.append(chapter_copy)
-                continue
-
-            structure_title, structure_sections = structure_match
-            used_structure_keys.add(chapter_key)
-            title_payload = structure_sections.get("章节标题", {})
-            chapter_copy["title"] = structure_title
-            chapter_copy["content"] = title_payload.get("content", "")
-            chapter_copy["page_start"] = title_payload.get("page_start", self._safe_page_number(chapter_copy.get("page")))
-            chapter_copy["page_end"] = title_payload.get("page_end", chapter_copy["page_start"])
-
-            structure_subsections = [
-                (section_title, payload)
-                for section_title, payload in structure_sections.items()
-                if section_title != "章节标题"
-            ]
-            catalog_subsections = chapter_copy.get("subsections", []) or []
-            subsection_by_key = {
-                self._normalize_heading_key(subsection.get("title", "")): subsection
-                for subsection in catalog_subsections
-                if subsection.get("title")
-            }
-
-            enriched_subsections: List[Dict[str, Any]] = []
-            for section_title, payload in structure_subsections:
-                section_key = self._normalize_heading_key(section_title)
-                subsection = dict(subsection_by_key.get(section_key, {}))
-                subsection.setdefault("title", section_title)
-                subsection.setdefault("page", str(payload.get("page_start", chapter_copy["page_start"])))
-                subsection.setdefault("level", 2)
-                subsection.setdefault("original", section_title)
-                subsection["content"] = payload.get("content", "")
-                subsection["page_start"] = payload.get("page_start", chapter_copy["page_start"])
-                subsection["page_end"] = payload.get("page_end", subsection["page_start"])
-                enriched_subsections.append(subsection)
-
-            chapter_copy["subsections"] = enriched_subsections
-            enriched_chapters.append(chapter_copy)
-
-        existing_catalog_keys = {
-            self._catalog_chapter_identity_key(chapter.get("title", ""))
-            for chapter in enriched_chapters
-            if chapter.get("title")
-        }
-        for chapter_title, structure_sections in structure_items:
-            chapter_key = self._catalog_chapter_identity_key(chapter_title)
-            if chapter_key in existing_catalog_keys or chapter_key in used_structure_keys:
-                continue
-
-            title_payload = structure_sections.get("章节标题", {})
-            new_chapter = {
-                "index": len(enriched_chapters) + 1,
-                "title": chapter_title,
-                "page": str(title_payload.get("page_start", 1)),
-                "original": chapter_title,
-                "content": title_payload.get("content", ""),
-                "page_start": title_payload.get("page_start", 1),
-                "page_end": title_payload.get("page_end", title_payload.get("page_start", 1)),
-                "subsections": [],
-            }
-            for section_title, payload in structure_sections.items():
-                if section_title == "章节标题":
-                    continue
-                new_chapter["subsections"].append({
-                    "title": section_title,
-                    "page": str(payload.get("page_start", new_chapter["page_start"])),
-                    "level": 2,
-                    "original": section_title,
-                    "content": payload.get("content", ""),
-                    "page_start": payload.get("page_start", new_chapter["page_start"]),
-                    "page_end": payload.get("page_end", payload.get("page_start", new_chapter["page_start"])),
-                })
-            enriched_chapters.append(new_chapter)
-
-        for index, chapter in enumerate(enriched_chapters, 1):
-            chapter["index"] = index
-
-        enriched["chapters"] = enriched_chapters
-        enriched["total_chapters"] = len(enriched_chapters)
-        enriched["formatted_text"] = self._format_catalog_chapters(enriched_chapters)
-        return enriched
-
-    def _reconcile_structure_with_catalog(
-        self,
-        chapters: Dict[str, Dict[str, Dict[str, Any]]],
-        catalog: Dict[str, Any],
-    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
-        """Map body-extraction results back onto the catalog skeleton.
-
-        Body extraction usually yields more complete content but may drop
-        heading levels; the catalog result has a stabler hierarchy but empty
-        or partial content. Titles are normalised and matched in order so
-        body content is re-attached to the catalog structure.
-        """
-        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
-        if not chapters or not catalog_chapters:
-            return chapters
-
-        section_title_key = "章节标题"
-        # Split the body structure into two indexes (chapter-title payloads and a
-        # flat list of section payloads) so catalog entries can be matched one by one in order.
-        chapter_title_payloads: Dict[str, List[Dict[str, Any]]] = {}
-        flat_sections: List[Tuple[str, Dict[str, Any]]] = []
-        matched_chapter_count = 0
-        matched_section_count = 0
-        total_catalog_sections = 0
-
-        for chapter_title, sections in chapters.items():
-            title_key = self._normalize_heading_key(chapter_title)
-            title_payload = sections.get(section_title_key)
-            if title_payload is not None:
-                chapter_title_payloads.setdefault(title_key, []).append({
-                    "content": title_payload.get("content", ""),
-                    "page_start": title_payload.get("page_start", 1),
-                    "page_end": title_payload.get("page_end", title_payload.get("page_start", 1)),
-                })
-
-            for section_title, payload in sections.items():
-                if section_title == section_title_key:
-                    continue
-                flat_sections.append((
-                    self._normalize_heading_key(section_title),
-                    {
-                        "content": payload.get("content", ""),
-                        "page_start": payload.get("page_start", 1),
-                        "page_end": payload.get("page_end", payload.get("page_start", 1)),
-                    },
-                ))
-
-        rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
-        # Match forward in document order first; if that fails, fall back to one
-        # global scan, balancing accuracy against fault tolerance.
-        search_start = 0
-        used_indices = set()
-
-        for chapter in catalog_chapters:
-            chapter_title = (chapter.get("title", "") or "").strip()
-            if not chapter_title:
-                continue
-
-            chapter_page = self._safe_page_number(chapter.get("page"))
-            chapter_key = self._normalize_heading_key(chapter_title)
-            title_candidates = chapter_title_payloads.get(chapter_key, [])
-            has_title_match = bool(title_candidates)
-            title_payload = title_candidates.pop(0) if title_candidates else self._empty_section_payload(chapter_page)
-            if has_title_match:
-                matched_chapter_count += 1
-
-            rebuilt[chapter_title] = {
-                section_title_key: title_payload,
-            }
-
-            for subsection in chapter.get("subsections", []):
-                section_title = (subsection.get("title", "") or "").strip()
-                if not section_title:
-                    continue
-                total_catalog_sections += 1
-
-                target_key = self._normalize_heading_key(section_title)
-                match_index = None
-                for idx in range(search_start, len(flat_sections)):
-                    if idx in used_indices:
-                        continue
-                    if flat_sections[idx][0] == target_key:
-                        match_index = idx
-                        break
-                if match_index is None:
-                    for idx, (section_key, _) in enumerate(flat_sections):
-                        if idx in used_indices:
-                            continue
-                        if section_key == target_key:
-                            match_index = idx
-                            break
-
-                if match_index is not None:
-                    used_indices.add(match_index)
-                    search_start = max(search_start, match_index + 1)
-                    rebuilt[chapter_title][section_title] = flat_sections[match_index][1]
-                    matched_section_count += 1
-                else:
-                    rebuilt[chapter_title][section_title] = self._empty_section_payload(
-                        self._safe_page_number(subsection.get("page"), chapter_page)
-                    )
-
-        if total_catalog_sections > 0 and matched_section_count == 0:
-            return chapters
-
-        if matched_chapter_count == 0 and matched_section_count == 0:
-            return chapters
-
-        return rebuilt or chapters
-
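The matching loop in `_reconcile_structure_with_catalog` can be sketched on its own. This is an illustrative reduction (hypothetical helper, plain string equality in place of normalised heading keys): prefer the first unused candidate at or after the running cursor, then fall back to a global scan so an out-of-order hit is still recovered.

```python
def match_in_order(targets, candidates):
    # Forward scan from `start` keeps document order; the global fallback
    # rescues headings that appear earlier than expected.
    used, start, result = set(), 0, {}
    for target in targets:
        hit = next((i for i in range(start, len(candidates))
                    if i not in used and candidates[i] == target), None)
        if hit is None:
            hit = next((i for i, c in enumerate(candidates)
                        if i not in used and c == target), None)
        if hit is not None:
            used.add(hit)
            start = max(start, hit + 1)
            result[target] = hit
    return result

print(match_in_order(["a", "b", "c"], ["b", "a", "c"]))  # {'a': 1, 'b': 0, 'c': 2}
```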
-    @staticmethod
-    def _normalize_heading_key(text: str) -> str:
-        normalized = PdfStructureExtractor._strip_catalog_page_suffix((text or "").strip())
-        normalized = normalized.replace("【", "[").replace("】", "]")
-        normalized = normalized.replace("(", "(").replace(")", ")")
-        normalized = normalized.replace(".", ".").replace("。", ".")
-        normalized = re.sub(r"\s+", "", normalized)
-        return normalized
-
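A standalone sketch of the normalisation above (the `_strip_catalog_page_suffix` step is elided): fold fullwidth brackets and punctuation to their ASCII forms and drop all whitespace, so spacing variants of the same heading compare equal.

```python
import re

def normalize_heading_key(text):
    # Fold fullwidth punctuation to ASCII, drop all whitespace.
    t = (text or "").strip()
    t = t.replace("【", "[").replace("】", "]")
    t = t.replace("(", "(").replace(")", ")")
    t = t.replace(".", ".").replace("。", ".")
    return re.sub(r"\s+", "", t)

print(normalize_heading_key("1.1 工程 概况") == normalize_heading_key("1.1工程概况"))  # True
```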
-    @staticmethod
-    def _safe_page_number(value: Any, default: int = 1) -> int:
-        try:
-            return max(1, int(str(value).strip()))
-        except Exception:
-            return default
-
-    @staticmethod
-    def _empty_section_payload(page_num: int) -> Dict[str, Any]:
-        return {
-            "content": "",
-            "page_start": page_num,
-            "page_end": page_num,
-        }
-
-    @classmethod
-    def _prepare_page_lines(cls, text: str) -> List[str]:
-        """Clean page text lines and try to re-join headings that were split across line breaks."""
-        raw_lines = [line.strip() for line in text.split("\n") if line.strip()]
-        prepared_lines: List[str] = []
-        index = 0
-
-        while index < len(raw_lines):
-            merged_line, consumed = cls._merge_heading_fragment(raw_lines, index)
-            if merged_line:
-                prepared_lines.append(merged_line)
-                index += consumed
-                continue
-
-            prepared_lines.append(raw_lines[index])
-            index += 1
-
-        return prepared_lines
-
-    @classmethod
-    def _merge_heading_fragment(
-        cls,
-        lines: List[str],
-        start_index: int,
-    ) -> Tuple[Optional[str], int]:
-        """Try to join the 2-3 lines starting at the current position into one complete heading."""
-        first_line = lines[start_index].strip()
-        if not first_line:
-            return None, 1
-
-        first_is_heading = bool(cls._matching_rule_names(first_line, "l1") or cls._matching_rule_names(first_line, "l2"))
-        first_is_incomplete = cls._is_incomplete_heading_fragment(first_line)
-        max_span = min(3, len(lines) - start_index)
-
-        for span in range(2, max_span + 1):
-            candidate_lines = [lines[start_index + offset].strip() for offset in range(span)]
-            candidate_text = " ".join(candidate_lines).strip()
-            if not candidate_text or cls.TOC_PATTERN.search(candidate_text):
-                continue
-            if not (cls._matching_rule_names(candidate_text, "l1") or cls._matching_rule_names(candidate_text, "l2")):
-                continue
-            # Only swallow the following lines when the first line itself looks like a
-            # truncated heading, or when the merged text is clearly more heading-like;
-            # this avoids eating body text.
-            if first_is_incomplete or not first_is_heading:
-                return candidate_text, span
-
-        return None, 1
-
-    @staticmethod
-    def _is_incomplete_heading_fragment(line: str) -> bool:
-        clean_line = re.sub(r"\s+", "", line.strip())
-        if not clean_line:
-            return False
-
-        fragment_patterns = (
-            r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",
-            r"^\d{1,2}(?:[\..。、])$",
-            r"^[【\[]\d+[\]】]$",
-            r"^[一二三四五六七八九十百零两]+[、)\)\]]$",
-            r"^第[一二三四五六七八九十百零两]+节$",
-            r"^\d+\.\d+(?!\.\d)\.?$",
-        )
-        return any(re.match(pattern, clean_line) for pattern in fragment_patterns)
-
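The fragment patterns above flag lines that are ONLY a numbering token, i.e. the first half of a heading split across a line break. A sketch with a subset of those patterns:

```python
import re

FRAGMENT_PATTERNS = (
    r"^第(?:\d+|[一二三四五六七八九十百零两]+)[章部分篇]$",  # "第3章" with no title text
    r"^\d{1,2}[.。、]$",                                      # bare "2." / "2、"
    r"^[【\[]\d+[\]】]$",                                     # bare "【1】"
)

def is_incomplete_heading_fragment(line):
    # Whitespace is stripped first so "第 3 章" is recognised like "第3章".
    clean = re.sub(r"\s+", "", line.strip())
    return bool(clean) and any(re.match(p, clean) for p in FRAGMENT_PATTERNS)

print(is_incomplete_heading_fragment("第 3 章"))        # True: numbering only
print(is_incomplete_heading_fragment("第3章 施工准备"))  # False: complete heading
```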
-    def _rebuild_section_contents_from_catalog(
-        self,
-        chapters: Dict[str, Dict[str, Dict[str, Any]]],
-        catalog: Dict[str, Any],
-        body_lines: List[Dict[str, Any]],
-    ) -> Dict[str, Dict[str, Dict[str, Any]]]:
-        """Re-slice section content based on catalog order and body line numbers.
-
-        When body-structure extraction misses some headings, using the
-        structured result directly tends to leave gaps in section content.
-        Here the catalog is flattened into one timeline of headings, each
-        heading is located in the linear body text, and the text between one
-        heading and the next becomes that section's content.
-        """
-        catalog_chapters = catalog.get("chapters", []) if isinstance(catalog, dict) else []
-        if not catalog_chapters or not body_lines:
-            return chapters
-
-        # First flatten the catalog into an ordered list so every heading's
-        # start position in the body can be located uniformly.
-        expected_items: List[Dict[str, Any]] = []
-        total_sections = 0
-        for chapter in catalog_chapters:
-            chapter_title = (chapter.get("title", "") or "").strip()
-            if not chapter_title:
-                continue
-            chapter_page = self._safe_page_number(chapter.get("page"))
-            expected_items.append({
-                "kind": "chapter",
-                "title": chapter_title,
-                "chapter_title": chapter_title,
-                "section_title": "章节标题",
-                "page_hint": chapter_page,
-                "line_index": None,
-                "page": chapter_page,
-            })
-
-            for subsection in chapter.get("subsections", []):
-                section_title = (subsection.get("title", "") or "").strip()
-                if not section_title:
-                    continue
-                total_sections += 1
-                expected_items.append({
-                    "kind": "section",
-                    "title": section_title,
-                    "chapter_title": chapter_title,
-                    "section_title": section_title,
-                    "page_hint": self._safe_page_number(subsection.get("page"), chapter_page),
-                    "line_index": None,
-                    "page": self._safe_page_number(subsection.get("page"), chapter_page),
-                })
-
-        if not expected_items or total_sections == 0:
-            return chapters
-
-        search_start = 0
-        found_sections = 0
-        for item in expected_items:
-            line_index = self._find_heading_line_index(
-                body_lines,
-                item["title"],
-                item["kind"],
-                search_start,
-            )
-            item["line_index"] = line_index
-            if line_index is not None:
-                item["page"] = body_lines[line_index]["page"]
-                search_start = line_index + 1
-                if item["kind"] == "section":
-                    found_sections += 1
-
-        if found_sections == 0:
-            return chapters
-
-        rebuilt: Dict[str, Dict[str, Dict[str, Any]]] = {}
-        section_title_key = "章节标题"
-
-        for chapter in catalog_chapters:
-            chapter_title = (chapter.get("title", "") or "").strip()
-            if not chapter_title:
-                continue
-
-            chapter_page = self._safe_page_number(chapter.get("page"))
-            existing_sections = chapters.get(chapter_title, {})
-            rebuilt[chapter_title] = {
-                section_title_key: existing_sections.get(section_title_key, self._empty_section_payload(chapter_page))
-            }
-
-            for subsection in chapter.get("subsections", []):
-                section_title = (subsection.get("title", "") or "").strip()
-                if not section_title:
-                    continue
-                rebuilt[chapter_title][section_title] = existing_sections.get(
-                    section_title,
-                    self._empty_section_payload(self._safe_page_number(subsection.get("page"), chapter_page)),
-                )
-
-        for idx, item in enumerate(expected_items):
-            if item["kind"] != "section" or item["line_index"] is None:
-                continue
-
-            # The next located heading is the current section's right boundary;
-            # with none left, read to the end of the document.
-            next_heading_index = len(body_lines)
-            for later in expected_items[idx + 1:]:
-                if later["line_index"] is not None:
-                    next_heading_index = later["line_index"]
-                    break
-
-            content_entries = body_lines[item["line_index"] + 1:next_heading_index]
-            content_text = "\n".join(entry["text"] for entry in content_entries).strip()
-            existing_payload = rebuilt[item["chapter_title"]].get(item["section_title"], {})
-
-            if not content_text and (existing_payload.get("content") or "").strip():
-                continue
-
-            if content_entries:
-                page_start = content_entries[0]["page"]
-                page_end = content_entries[-1]["page"]
-            else:
-                page_start = item["page"]
-                page_end = item["page"]
-
-            rebuilt[item["chapter_title"]][item["section_title"]] = {
-                "content": content_text,
-                "page_start": page_start,
-                "page_end": page_end,
-            }
-
-        return rebuilt or chapters
-
-    def _find_heading_line_index(
-        self,
-        body_lines: List[Dict[str, Any]],
-        target_title: str,
-        heading_kind: str,
-        start_index: int,
-    ) -> Optional[int]:
-        """在线性正文中查找目标标题行。
-
-        先做归一化后的精确匹配;若 OCR / PDF 抽取给标题前面带了噪声前缀,
-        再退一步做“候选行后缀等于目标标题”的宽松匹配。
-        """
-        target_key = self._normalize_heading_key(target_title)
-        if not target_key:
-            return None
-
-        for index in range(start_index, len(body_lines)):
-            candidate_text = (body_lines[index].get("text") or "").strip()
-            if not candidate_text or self.TOC_PATTERN.search(candidate_text):
-                continue
-
-            if heading_kind == "chapter":
-                candidate_key = self._normalize_heading_key(self._clean_chapter_title(candidate_text))
-            else:
-                candidate_key = self._normalize_heading_key(self._clean_section_title(candidate_text))
-
-            if candidate_key == target_key:
-                return index
-
-            raw_candidate_key = self._normalize_heading_key(candidate_text)
-            # 某些 PDF 会把页码、序号或残余字符拼到标题前面,这里允许有限前缀噪声。
-            if raw_candidate_key.endswith(target_key):
-                prefix = raw_candidate_key[:-len(target_key)]
-                if not prefix or re.fullmatch(
-                    r"[\dA-Za-z\.\-_/|,:;()\[\]\u3001\u3002\uff0c\uff1a\uff1b\uff08\uff09\u3010\u3011]+",
-                    prefix,
-                ):
-                    return index
-
-        return None
-
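上面的宽松匹配逻辑(候选行后缀等于目标标题,且前缀仅含有限噪声字符)可以用如下独立代码示意。注意这只是示意:省略了 `_normalize_heading_key` 的归一化步骤,`is_heading_match` 并非原实现中的函数名。

```python
import re

# 允许的前缀噪声:数字、字母与常见标点(照抄自上文的字符类)
NOISE_PREFIX = re.compile(
    r"[\dA-Za-z\.\-_/|,:;()\[\]\u3001\u3002\uff0c\uff1a\uff1b\uff08\uff09\u3010\u3011]+"
)

def is_heading_match(candidate: str, target: str) -> bool:
    """精确匹配,或候选行以目标标题结尾且前缀全部为噪声字符。"""
    if candidate == target:
        return True
    if candidate.endswith(target):
        prefix = candidate[:-len(target)]
        return not prefix or bool(NOISE_PREFIX.fullmatch(prefix))
    return False

print(is_heading_match("12第三章施工部署", "第三章施工部署"))   # 数字噪声前缀,命中
print(is_heading_match("概述第三章施工部署", "第三章施工部署"))  # 中文前缀不算噪声,不命中
```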
-    def _process_ocr_concurrent(self, regions: List[TableRegion], progress_callback=None) -> List[OcrResult]:
-        """同步并发处理 OCR(使用 ThreadPoolExecutor)"""
-        results: List[OcrResult] = []
-        total = len(regions)
-        completed = 0
-
-        with ThreadPoolExecutor(max_workers=self.OCR_CONCURRENT_WORKERS) as executor:
-            # 提交所有任务
-            future_to_region = {
-                executor.submit(self._ocr_table_region, r.page, r.bbox): r
-                for r in regions
-            }
-
-            # 处理完成的结果
-            for future in as_completed(future_to_region):
-                region = future_to_region[future]
-                completed += 1
-                try:
-                    text = future.result()
-                    results.append(OcrResult(
-                        page_num=region.page_num,
-                        bbox=region.bbox,
-                        score=region.score,
-                        text=text,
-                        success=True,
-                    ))
-                except Exception as e:
-                    logger.error(f"  第 {region.page_num} 页表格 OCR 失败: {e}")
-                    results.append(OcrResult(
-                        page_num=region.page_num,
-                        bbox=region.bbox,
-                        score=region.score,
-                        text="",
-                        success=False,
-                    ))
-
-                # 每完成5个或最后一个时推送进度
-                if progress_callback and (completed % 5 == 0 or completed == total):
-                    progress = 35 + int(completed / total * 15)  # OCR执行占15%进度(35-50)
-                    progress_callback("版面分析", progress, f"OCR识别中 {completed}/{total}")
-
-        return results
-
-    def _detect_table_regions(
-        self,
-        page: fitz.Page,
-        page_num: int,
-        clip_box: fitz.Rect
-    ) -> List[Tuple[Tuple[float, float, float, float], float]]:
-        """检测页面中的表格区域,返回坐标列表"""
-        table_regions: List[Tuple[Tuple[float, float, float, float], float]] = []
-
-        if not RAPID_LAYOUT_AVAILABLE:
-            return table_regions
-
-        layout_engine = self._get_layout_engine()
-        if layout_engine is None:
-            return table_regions
-
-        # 渲染页面(裁剪区域)
-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=clip_box)
-        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
-
-        try:
-            layout_output = layout_engine(img)
-
-            # 解析版面结果
-            if hasattr(layout_output, 'boxes') and hasattr(layout_output, 'class_names'):
-                # 获取缩放比例
-                scale_x = clip_box.width / img.shape[1]
-                scale_y = clip_box.height / img.shape[0]
-
-                for box, label, score in zip(layout_output.boxes, layout_output.class_names, layout_output.scores):
-                    if label == "table" and score > self.OCR_CONFIDENCE_THRESHOLD:
-                        # 转换为 PDF 坐标
-                        pdf_x1 = clip_box.x0 + box[0] * scale_x
-                        pdf_y1 = clip_box.y0 + box[1] * scale_y
-                        pdf_x2 = clip_box.x0 + box[2] * scale_x
-                        pdf_y2 = clip_box.y0 + box[3] * scale_y
-
-                        table_regions.append(((pdf_x1, pdf_y1, pdf_x2, pdf_y2), score))
-
-        except Exception as e:
-            logger.warning(f"  第 {page_num} 页: 版面分析失败 ({e})")
-
-        return table_regions
-
-    def _ocr_table_region(self, page: fitz.Page, bbox: Tuple[float, float, float, float], max_retries: int = 3) -> str:
-        """对指定区域进行 OCR 识别(使用 GLM-OCR),支持指数退避重试"""
-        import time
-
-        # 渲染指定区域
-        rect = fitz.Rect(bbox)
-        pix = page.get_pixmap(dpi=self.OCR_DPI, clip=rect)
-        img_bytes = pix.tobytes("jpeg")
-
-        # 压缩图片
-        compressed = self._compress_image(img_bytes)
-        img_base64 = base64.b64encode(compressed).decode('utf-8')
-
-        # 请求 OCR
-        payload = {
-            "model": "GLM-OCR",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "识别图片中的表格内容,按原文排版输出。"
-                                    "注意:"
-                                    "1. 表格用 Markdown 表格格式"
-                                    "2. 保持换行和列对齐"
-                                    "3. 只输出表格内容,不要其他说明"
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}
-                        }
-                    ]
-                }
-            ],
-            "max_tokens": 2048,
-            "temperature": 0.1
-        }
-
-        headers = {"Content-Type": "application/json"}
-        if self.ocr_api_key:
-            headers["Authorization"] = f"Bearer {self.ocr_api_key}"
-
-        # 指数退避重试
-        last_error = None
-        for attempt in range(max_retries):
-            try:
-                response = requests.post(
-                    self.ocr_api_url,
-                    headers=headers,
-                    json=payload,
-                    timeout=self.ocr_timeout
-                )
-                response.raise_for_status()
-
-                result = response.json()
-                return self._extract_ocr_content(result)
-
-            except Exception as e:
-                last_error = e
-                if attempt < max_retries - 1:
-                    # 指数退避: 2, 4, 8 秒
-                    wait_time = 2 ** (attempt + 1)
-                    logger.warning(f"  第 {page.number + 1} 页表格 OCR 第 {attempt + 1} 次失败: {e}, {wait_time}秒后重试...")
-                    time.sleep(wait_time)
-                else:
-                    logger.error(f"  第 {page.number + 1} 页表格 OCR 最终失败(已重试{max_retries}次): {e}")
-
-        # 所有重试都失败,抛出最后一个错误
-        raise last_error
-
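上述重试采用 2、4、8 秒的指数退避,等待时间即 `2 ** (attempt + 1)`;最后一次尝试失败直接抛错、不再等待。可以用一个独立的小函数示意该等待序列(非原实现):

```python
def backoff_schedule(max_retries: int = 3) -> list:
    """重试之间的等待秒数序列:2 ** (attempt + 1)。

    与 _ocr_table_region 的循环一致:第 0 次失败后等 2 秒,
    第 1 次失败后等 4 秒;最后一次尝试失败不等待而是抛出异常。
    """
    return [2 ** (attempt + 1) for attempt in range(max_retries - 1)]

print(backoff_schedule())   # → [2, 4]
print(backoff_schedule(4))  # → [2, 4, 8]
```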
-    def _replace_table_regions(
-        self,
-        page: fitz.Page,
-        original_text: str,
-        ocr_results: List[Dict],
-        clip_box: fitz.Rect
-    ) -> str:
-        """用 OCR 结果替换原始文本中的表格区域"""
-        if not ocr_results:
-            return original_text
-
-        text_blocks = []
-        for block in page.get_text("blocks"):
-            x0, y0, x1, y1, text, _, _ = block
-            if y0 >= clip_box.y0 and y1 <= clip_box.y1:
-                text_blocks.append({
-                    "bbox": (x0, y0, x1, y1),
-                    "text": text.strip(),
-                })
-
-        text_blocks.sort(key=lambda b: (b["bbox"][1], b["bbox"][0]))
-
-        if not text_blocks:
-            return original_text
-
-        region_entries: List[Dict[str, Any]] = []
-        replaced_indices: Set[int] = set()
-
-        for ocr_result in sorted(ocr_results, key=lambda r: r["bbox"][1]):
-            rx0, ry0, rx1, ry1 = ocr_result["bbox"]
-            current_indices: List[int] = []
-
-            for idx, block in enumerate(text_blocks):
-                if idx in replaced_indices:
-                    continue
-                if self._block_contains_heading(block["text"]):
-                    continue
-
-                bx0, by0, bx1, by1 = block["bbox"]
-                overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
-                overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
-                overlap_area = overlap_x * overlap_y
-                block_area = max((bx1 - bx0) * (by1 - by0), 1)
-
-                if overlap_area / block_area > 0.5:
-                    current_indices.append(idx)
-
-            if not current_indices:
-                continue
-
-            replaced_indices.update(current_indices)
-            region_entries.append({
-                "start": min(current_indices),
-                "end": max(current_indices),
-                "ocr_text": (ocr_result.get("ocr_text") or "").strip(),
-            })
-
-        if not region_entries:
-            return original_text
-
-        region_by_start = {entry["start"]: entry for entry in region_entries}
-        result_parts: List[str] = []
-        idx = 0
-
-        while idx < len(text_blocks):
-            region = region_by_start.get(idx)
-            if region is not None:
-                if region["ocr_text"]:
-                    result_parts.append(region["ocr_text"])
-                    result_parts.append("\n")
-                else:
-                    for block_idx in range(region["start"], region["end"] + 1):
-                        block_text = text_blocks[block_idx]["text"]
-                        if block_text:
-                            result_parts.append(block_text)
-                            result_parts.append("\n")
-                idx = region["end"] + 1
-                continue
-
-            if idx not in replaced_indices:
-                block_text = text_blocks[idx]["text"]
-                if block_text:
-                    result_parts.append(block_text)
-                    result_parts.append("\n")
-            idx += 1
-
-        return "".join(result_parts).strip() or original_text
-
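`_replace_table_regions` 以"文本块面积被 OCR 区域覆盖超过 50%"作为替换条件,其重叠面积计算可独立示意如下(坐标均为 x0, y0, x1, y1;`overlap_ratio` 为示意函数名,非原实现):

```python
def overlap_ratio(block, region):
    """文本块面积中被 OCR 区域覆盖的比例,两参数均为 (x0, y0, x1, y1)。"""
    bx0, by0, bx1, by1 = block
    rx0, ry0, rx1, ry1 = region
    overlap_x = max(0, min(bx1, rx1) - max(bx0, rx0))
    overlap_y = max(0, min(by1, ry1) - max(by0, ry0))
    block_area = max((bx1 - bx0) * (by1 - by0), 1)  # 防止除零
    return overlap_x * overlap_y / block_area

# 10x10 的块完全落在区域内 → 比例 1.0,会被替换
print(overlap_ratio((0, 0, 10, 10), (0, 0, 20, 20)))
# 恰好覆盖一半 → 0.5,在 "> 0.5" 规则下不会被替换
print(overlap_ratio((0, 0, 10, 10), (0, 0, 5, 10)))
```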
-    @classmethod
-    def _block_contains_heading(cls, text: str) -> bool:
-        if not text or not text.strip():
-            return False
-
-        for line in cls._prepare_page_lines(text):
-            stripped = line.strip()
-            if not stripped:
-                continue
-            if cls._matching_rule_names(stripped, "l1") or cls._matching_rule_names(stripped, "l2"):
-                return True
-        return False
-
-    def _compress_image(self, img_bytes: bytes) -> bytes:
-        """压缩图片"""
-        try:
-            from PIL import Image
-            img = Image.open(io.BytesIO(img_bytes))
-
-            if img.mode in ('RGBA', 'LA', 'P'):
-                background = Image.new('RGB', img.size, (255, 255, 255))
-                if img.mode == 'P':
-                    img = img.convert('RGBA')
-                if img.mode in ('RGBA', 'LA'):
-                    background.paste(img, mask=img.split()[-1])
-                img = background
-            elif img.mode != 'RGB':
-                img = img.convert('RGB')
-
-            min_edge = min(img.size)
-            if min_edge > self.MAX_SHORT_EDGE:
-                ratio = self.MAX_SHORT_EDGE / min_edge
-                new_size = (int(img.width * ratio), int(img.height * ratio))
-                img = img.resize(new_size, Image.Resampling.LANCZOS)
-
-            buffer = io.BytesIO()
-            img.save(buffer, format='JPEG', quality=self.JPEG_QUALITY, optimize=True)
-            return buffer.getvalue()
-
-        except Exception as e:
-            logger.warning(f"图片压缩失败,使用原图: {e}")
-            return img_bytes
-
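`_compress_image` 按短边等比缩放。下面的示意函数演示该缩放算术;`MAX_SHORT_EDGE` 取 1024 仅为假设,该常量的实际取值未在本段代码中给出:

```python
def resize_to_short_edge(width: int, height: int, max_short_edge: int = 1024):
    """短边超过 max_short_edge 时等比缩小,否则保持原尺寸(1024 为假设值)。"""
    min_edge = min(width, height)
    if min_edge <= max_short_edge:
        return width, height
    ratio = max_short_edge / min_edge
    return int(width * ratio), int(height * ratio)

print(resize_to_short_edge(4000, 2000))  # → (2048, 1024)
print(resize_to_short_edge(800, 600))    # 短边未超限,原样返回
```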
-    def _extract_ocr_content(self, result: Dict) -> str:
-        """从 OCR 响应提取内容,并将 HTML 表格转换为 Markdown"""
-        content = ""
-        if "choices" in result and isinstance(result["choices"], list):
-            if len(result["choices"]) > 0:
-                message = result["choices"][0].get("message", {})
-                content = message.get("content", "")
-
-        # 如果内容包含 HTML 标签,转换为 Markdown
-        if content and "<" in content and ">" in content:
-            try:
-                from ..doc_worker.pdf_worker.html_to_markdown import convert_html_to_markdown
-                content = convert_html_to_markdown(content)
-            except Exception as e:
-                logger.debug(f"HTML 转 Markdown 失败,保留原始内容: {e}")
-
-        return content
-
-    @staticmethod
-    def _is_header_footer(line: str) -> bool:
-        compact_line = re.sub(r"\s+", "", line.strip())
-        if not compact_line:
-            return False
-
-        heading_prefix = re.match(
-            r"^(第[\d一二三四五六七八九十百零两]+[章节部分篇]|[\d]+\.\d+|[\d]+[\..。、]?|[一二三四五六七八九十百零两]+[、)\)\]]|第[一二三四五六七八九十百零两]+节|【\d+】)",
-            compact_line,
-        )
-
-        if compact_line.isdigit():
-            return True
-
-        if (
-            compact_line.endswith("有限责任公司")
-            or compact_line.endswith("有限公司")
-            or compact_line.endswith("股份有限公司")
-        ) and not heading_prefix:
-            return True
-
-        if compact_line.endswith("专项施工方案") and not heading_prefix:
-            return True
-
-        return (
-            "四川路桥建设集团股份有限公司" in line
-            or "T梁运输及安装专项施工方案" in line
-            or (
-                compact_line.endswith("工程项目")
-                and len(compact_line) >= 8
-                and not compact_line.startswith("第")
-            )
-        )
-
-    @classmethod
-    def _matching_rule_names(
-        cls,
-        line: str,
-        level: str,
-        rule_names: Optional[List[str]] = None,
-    ) -> List[str]:
-        clean_line = line.strip()
-        if level == "l1":
-            clean_line = cls._strip_leading_page_number_from_cn_chapter(clean_line)
-        names = rule_names or list(cls.RULE_LIB.keys())
-        return [
-            rule_name
-            for rule_name in names
-            if cls.RULE_LIB[rule_name][level].match(clean_line)
-        ]
-
-    @classmethod
-    def _matches_chapter_heading(cls, line: str, rule_names: Optional[List[str]] = None) -> bool:
-        return bool(cls._matching_rule_names(line, "l1", rule_names))
-
-    @classmethod
-    def _matches_section_heading(cls, line: str, rule_names: Optional[List[str]] = None) -> bool:
-        return bool(cls._matching_rule_names(line, "l2", rule_names))
-
-    @staticmethod
-    def _strip_leading_page_number_from_cn_chapter(line: str) -> str:
-        cleaned = re.sub(r"\s+", " ", line.strip())
-        if not cleaned:
-            return ""
-
-        return re.sub(
-            r"^\d{1,3}\s+(?=第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])",
-            "",
-            cleaned,
-            count=1,
-        ).strip()
-
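该正则用于去掉中文章标题前粘连的页码(OCR 常见噪声),可单独验证如下(正则照抄自上文,函数名为示意):

```python
import re

def strip_leading_page_number(line: str) -> str:
    """去掉形如 "12 第三章 ..." 前面的 1-3 位页码。"""
    cleaned = re.sub(r"\s+", " ", line.strip())
    return re.sub(
        r"^\d{1,3}\s+(?=第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])",
        "",
        cleaned,
        count=1,
    ).strip()

print(strip_leading_page_number("12 第三章 施工部署"))  # 页码被去掉
print(strip_leading_page_number("第三章 施工部署"))      # 无页码,原样返回
```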
-    @staticmethod
-    def _clean_chapter_title(line: str) -> str:
-        cleaned = PdfStructureExtractor._strip_leading_page_number_from_cn_chapter(line)
-        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
-        cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
-        cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
-        cleaned = re.sub(r"\s+", " ", cleaned).strip()
-
-        cn_chapter_match = re.match(
-            r"^(第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[章部分篇])[\s、::\.-]*(.*)$",
-            cleaned,
-        )
-        if cn_chapter_match:
-            prefix = re.sub(r"\s+", "", cn_chapter_match.group(1))
-            title = cn_chapter_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        num_chapter_match = re.match(r"^(\d{1,2})(?:[\..。、])?\s*(.*)$", cleaned)
-        if num_chapter_match:
-            prefix = num_chapter_match.group(1)
-            title = num_chapter_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        return cleaned
-
-    @staticmethod
-    def _clean_section_title(line: str) -> str:
-        cleaned = line.strip()
-        cleaned = PdfStructureExtractor._strip_catalog_page_suffix(cleaned)
-        cleaned = re.sub(r"\s+\d+\s*$", "", cleaned)
-        cleaned = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*$", "", cleaned)
-        cleaned = re.sub(r"\s+", " ", cleaned).strip()
-
-        numeric_section_match = re.match(r"^(\d+\.\d+)(?!\.\d)\.?\s*(.*)$", cleaned)
-        if numeric_section_match:
-            prefix = numeric_section_match.group(1)
-            title = numeric_section_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        numeric_list_match = re.match(r"^(\d{1,2})(?:[、\.\uFF0E\u3002\)\]\uFF09])\s*(.*)$", cleaned)
-        if numeric_list_match:
-            prefix = numeric_list_match.group(1)
-            title = numeric_list_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        cn_section_match = re.match(r"^(第\s*[一二三四五六七八九十百零两]+\s*节)[\s、::\.-]*(.*)$", cleaned)
-        if cn_section_match:
-            prefix = re.sub(r"\s+", "", cn_section_match.group(1))
-            title = cn_section_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        cn_list_match = re.match(r"^([一二三四五六七八九十百零两]+[、)\)\]])[\s]*(.*)$", cleaned)
-        if cn_list_match:
-            prefix = cn_list_match.group(1).strip()
-            title = cn_list_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        bracket_match = re.match(r"^([【\[]\s*\d+\s*[\]】])[\s]*(.*)$", cleaned)
-        if bracket_match:
-            prefix = re.sub(r"\s+", "", bracket_match.group(1))
-            title = bracket_match.group(2).strip()
-            return f"{prefix} {title}".strip()
-
-        return cleaned
-
-    @staticmethod
-    def _strip_catalog_page_suffix(text: str) -> str:
-        cleaned = re.sub(r"\s+", " ", (text or "").strip())
-        if not cleaned:
-            return ""
-
-        return re.sub(
-            r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
-            "",
-            cleaned,
-        ).strip()
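目录行尾的点线引导符连同页码会被一并去掉,效果可示意如下(正则照抄自上文,函数名为示意):

```python
import re

def strip_catalog_page_suffix(text: str) -> str:
    """去掉目录行尾的点线引导符与页码,如 "...... 12"。"""
    cleaned = re.sub(r"\s+", " ", (text or "").strip())
    return re.sub(
        r"(?:[.\u2026\u00b7\u2022·• ]{2,})[-\u2013\u2014 ]*\d+\s*[-\u2013\u2014 ]*$",
        "",
        cleaned,
    ).strip()

print(strip_catalog_page_suffix("第一章 编制依据 .......... 3"))  # 去掉引导符和页码
print(strip_catalog_page_suffix("第一章 编制依据"))                # 无后缀,原样返回
```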

+ 481 - 0
core/construction_review/component/minimal_pipeline/pdf_extractor3.py

@@ -0,0 +1,481 @@
+"""
+PDF 结构提取器 - 同步并发 OCR 版本
+
+基于 splitter_pdf 逻辑,直接提取章节结构并记录页码。
+支持 OCR 增强:检测表格区域并使用 ThreadPoolExecutor 以 5 路并发执行 OCR,其余文本仍由 PyMuPDF 提取。
+输出格式兼容后续分类与组装流程。
+"""
+
+import re
+from typing import Dict, Any, List, Optional, Tuple
+
+import fitz
+
+from foundation.observability.logger.loggering import review_logger as logger
+
+from .ocr_processor import OcrProcessor, TableRegion, OcrResult
+
+# 尝试导入 RapidLayout
+try:
+    from rapid_layout import RapidLayout
+    RAPID_LAYOUT_AVAILABLE = True
+except ImportError:
+    RAPID_LAYOUT_AVAILABLE = False
+    RapidLayout = None
+
+
+class PdfStructureExtractor:
+    """PDF 章节结构提取器(支持 OCR 同步并发)"""
+
+    CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
+    SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')
+    TOC_PATTERN = re.compile(r"\.{3,}|…{2,}")
+
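注意这两个模式只识别中文数字标题("第X章"与"X、"),阿拉伯数字编号的标题(如 "3.1 总体部署")不会命中,与被替换的旧版多规则 RULE_LIB 不同。可以快速验证:

```python
import re

# 照抄自 PdfStructureExtractor 的类属性
CHAPTER_PATTERN = re.compile(r'^第[一二三四五六七八九十百]+章\s*.*')
SECTION_PATTERN = re.compile(r'^[一二三四五六七八九十百]+、\s*.*')

print(bool(CHAPTER_PATTERN.match("第三章 施工部署")))  # True
print(bool(CHAPTER_PATTERN.match("3 施工部署")))       # False:阿拉伯数字不在覆盖范围内
print(bool(SECTION_PATTERN.match("二、工程概况")))      # True
```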
+    def __init__(
+        self,
+        clip_top: float = 60,
+        clip_bottom: float = 60,
+        use_ocr: bool = False,
+        ocr_api_url: str = "http://183.220.37.46:25429/v1/chat/completions",
+        ocr_timeout: int = 600,
+        ocr_api_key: str = "",
+        detect_toc: bool = True,
+        toc_model_path: str = "config/yolo/best.pt",
+    ):
+        self.clip_top = clip_top
+        self.clip_bottom = clip_bottom
+        self.use_ocr = use_ocr and RAPID_LAYOUT_AVAILABLE
+
+        # 初始化 OCR 处理器
+        self._ocr_processor = OcrProcessor(
+            ocr_api_url=ocr_api_url,
+            ocr_timeout=ocr_timeout,
+            ocr_api_key=ocr_api_key,
+        ) if self.use_ocr else None
+
+        # 目录检测配置
+        self.detect_toc = detect_toc
+        self.toc_model_path = toc_model_path
+        self._toc_extractor = None
+
+        if use_ocr and not RAPID_LAYOUT_AVAILABLE:
+            logger.warning("RapidLayout 未安装,OCR 功能不可用")
+
+    def extract(self, file_content: bytes, progress_callback=None) -> Dict[str, Any]:
+        """
+        从 PDF 字节流提取章节结构。
+
+        Args:
+            file_content: PDF 文件字节流
+            progress_callback: 进度回调函数,接收 (stage, current, message) 参数
+
+        Returns:
+            {
+                "chapters": {
+                    "第一章 xxx": {
+                        "章节标题": {"content": "...", "page_start": 1, "page_end": 1},
+                        "一、xxx": {"content": "...", "page_start": 2, "page_end": 3},
+                    }
+                },
+                "total_pages": N,
+                "catalog": {  # 目录结构(YOLO检测+OCR提取)
+                    "chapters": [...],
+                    "total_chapters": N
+                }
+            }
+        """
+        result = {"chapters": {}, "total_pages": 0, "catalog": None}
+
+        # === 阶段0: 目录页检测与提取(如果启用)===
+        if self.detect_toc:
+            try:
+                catalog = self._extract_catalog(file_content, progress_callback)
+                if catalog:
+                    result["catalog"] = catalog
+                    logger.info(f"[PDF提取] 目录提取完成: {catalog.get('total_chapters', 0)} 章")
+            except Exception as e:
+                logger.warning(f"[PDF提取] 目录提取失败: {e}")
+
+        # === 阶段1-3: 文档结构提取 ===
+        doc = fitz.open(stream=file_content)
+        try:
+            structure = self._extract_from_doc(doc, progress_callback)
+            result["chapters"] = structure.get("chapters", {})
+            result["total_pages"] = len(doc)
+            return result
+        finally:
+            doc.close()
+
+    def _extract_catalog(self, file_content: bytes, progress_callback=None) -> Optional[Dict[str, Any]]:
+        """
+        提取目录结构(YOLO检测 + OCR识别)
+
+        Returns:
+            {"chapters": [...], "total_chapters": N} 或 None
+        """
+        # 延迟导入避免循环依赖(YOLO依赖必须存在,否则报错)
+        from .toc_detector import TOCCatalogExtractor
+
+        if self._toc_extractor is None:
+            # 使用 OCR 处理器的配置(如果已初始化)
+            ocr_config = {}
+            if self._ocr_processor:
+                ocr_config = {
+                    "ocr_api_url": self._ocr_processor.ocr_api_url,
+                    "ocr_api_key": self._ocr_processor.ocr_api_key,
+                    "ocr_timeout": self._ocr_processor.ocr_timeout,
+                }
+            self._toc_extractor = TOCCatalogExtractor(
+                model_path=self.toc_model_path,
+                **ocr_config
+            )
+
+        return self._toc_extractor.detect_and_extract(file_content, progress_callback)
+
+    def _extract_from_doc(self, doc: fitz.Document, progress_callback=None) -> Dict[str, Any]:
+        """
+        提取文档结构(支持 OCR 同步并发)- 带坐标的精准回填方案。
+
+        流程:
+        1. 提取带坐标的文本块
+        2. 章节标题匹配 + 块归属划分
+        3. 扫描表格区域并 OCR
+        4. 根据表格坐标,将其作为新的块插入到对应小节
+        5. 将每个小节的块列表按顺序拼接成纯文本输出
+        """
+
+        def _emit_progress(stage: str, current: int, message: str):
+            """发送进度回调"""
+            if progress_callback:
+                try:
+                    progress_callback(stage, current, message)
+                except Exception:
+                    pass
+
+        total_pages = len(doc)
+
+        # ==================== 阶段1: 提取带坐标的文本块并归属到章节/小节====================
+        logger.info("[阶段1] 提取带坐标的文本块并归属章节...")
+
+        # 数据结构: {(chapter_name, section_name): [blocks_with_position]}
+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
+        current_chapter = "未分类前言"
+        current_section = "默认部分"
+        in_body = False
+
+        for page_num in range(total_pages):
+            page = doc.load_page(page_num)
+            rect = page.rect
+            clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+
+            # 获取带坐标的文本块
+            blocks = self._extract_text_blocks_with_position(page, clip_box)
+
+            for block in blocks:
+                line = block["text"]
+
+                # 跳过空行和页眉页脚
+                if not line.strip():
+                    continue
+                if self._is_header_footer(line):
+                    continue
+
+                # 跳过目录阶段
+                if not in_body:
+                    if self.CHAPTER_PATTERN.match(line) and not self.TOC_PATTERN.search(line):
+                        in_body = True
+                    else:
+                        continue
+
+                # 跳过残余目录格式
+                if self.TOC_PATTERN.search(line):
+                    continue
+
+                # 匹配章标题
+                if self.CHAPTER_PATTERN.match(line):
+                    current_chapter = self._clean_chapter_title(line)
+                    current_section = "章节标题"
+                    key = (current_chapter, current_section)
+                    if key not in chapter_blocks:
+                        chapter_blocks[key] = []
+                    chapter_blocks[key].append(block)
+                    continue
+
+                # 匹配节标题
+                if self.SECTION_PATTERN.match(line):
+                    current_section = line
+                    key = (current_chapter, current_section)
+                    if key not in chapter_blocks:
+                        chapter_blocks[key] = []
+                    chapter_blocks[key].append(block)
+                    continue
+
+                # 普通内容块
+                key = (current_chapter, current_section)
+                if key not in chapter_blocks:
+                    chapter_blocks[key] = []
+                chapter_blocks[key].append(block)
+
+        logger.info(f"[阶段1] 章节结构提取完成,共 {len({k[0] for k in chapter_blocks})} 个章节")
+
+        # ==================== 阶段2: 收集表格区域并OCR(如果启用OCR)====================
+        table_regions: List[TableRegion] = []
+        ocr_results: List[OcrResult] = []
+
+        if self.use_ocr and self._ocr_processor:
+            logger.info("[阶段2] 扫描表格区域...")
+            for page_num in range(total_pages):
+                page = doc.load_page(page_num)
+                rect = page.rect
+                clip_box = fitz.Rect(0, self.clip_top, rect.width, rect.height - self.clip_bottom)
+                regions = self._ocr_processor.detect_table_regions(page, page_num + 1, clip_box)
+                for bbox, score in regions:
+                    table_regions.append(TableRegion(
+                        page_num=page_num + 1,
+                        page=page,
+                        bbox=bbox,
+                        score=score
+                    ))
+                # 每5页推送进度
+                if (page_num + 1) % 5 == 0 or page_num == total_pages - 1:
+                    progress = int((page_num + 1) / total_pages * 30)
+                    _emit_progress("版面分析", progress, f"扫描页面 {page_num + 1}/{total_pages}")
+
+            logger.info(f"[阶段2] 发现 {len(table_regions)} 个表格区域")
+
+            # 执行OCR
+            if table_regions:
+                _emit_progress("版面分析", 35, f"发现 {len(table_regions)} 个表格,开始OCR识别...")
+                ocr_results = self._ocr_processor.process_ocr_concurrent(
+                    table_regions,
+                    progress_callback=lambda completed, total: _emit_progress(
+                        "版面分析", 35 + int(completed / total * 15), f"OCR识别中 {completed}/{total}"
+                    )
+                )
+                success_count = sum(1 for r in ocr_results if r.success)
+                logger.info(f"[阶段2] OCR完成 {success_count}/{len(table_regions)}")
+                _emit_progress("版面分析", 50, f"OCR识别完成 {success_count}/{len(table_regions)}")
+
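进度条按阶段分段映射:页面扫描占 0-30,OCR 识别占 35-50(后续阶段占剩余区间,本段未给出)。该算术可用两个示意函数表达(函数名为示意,非原实现):

```python
def scan_progress(pages_done: int, total_pages: int) -> int:
    """页面扫描阶段占进度条的 0-30。"""
    return int(pages_done / total_pages * 30)

def ocr_progress(completed: int, total: int) -> int:
    """OCR 识别阶段占进度条的 35-50,即 35 + 完成比例 * 15。"""
    return 35 + int(completed / total * 15)

print(scan_progress(10, 20))  # → 15
print(ocr_progress(4, 8))     # → 42
print(ocr_progress(8, 8))     # → 50
```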
+        # ==================== 阶段3: 将OCR结果作为新块插入到对应章节====================
+        if ocr_results:
+            logger.info("[阶段3] 将OCR结果回填到对应章节...")
+            self._insert_ocr_blocks_into_chapters(chapter_blocks, ocr_results)
+
+        # ==================== 阶段4: 生成最终输出(块列表转纯文本)====================
+        logger.info("[阶段4] 生成最终文本输出...")
+        result: Dict[str, Any] = {"chapters": {}}
+
+        for (chap_name, sec_name), blocks in chapter_blocks.items():
+            if chap_name not in result["chapters"]:
+                result["chapters"][chap_name] = {}
+
+            # 按页码和Y坐标排序块
+            blocks.sort(key=lambda b: (b["page"], b["bbox"][1]))
+
+            # 拼接文本
+            lines = []
+            page_start = blocks[0]["page"] if blocks else 1
+            page_end = blocks[-1]["page"] if blocks else 1
+
+            for block in blocks:
+                if block.get("type") == "table":
+                    lines.append(f"\n[表格OCR识别结果]:\n{block['text']}\n[/表格]\n")
+                else:
+                    lines.append(block["text"])
+
+            result["chapters"][chap_name][sec_name] = {
+                "content": "\n".join(lines),
+                "page_start": page_start,
+                "page_end": page_end,
+            }
+
+        logger.info(f"[PdfExtractor] 提取完成,共 {len(result['chapters'])} 个章节")
+        return result
+
+    def _extract_text_blocks_with_position(
+        self,
+        page: fitz.Page,
+        clip_box: fitz.Rect
+    ) -> List[Dict[str, Any]]:
+        """
+        提取带坐标的文本块列表。
+
+        使用 page.get_text("dict") 获取每个文本块的精确边界框和文本内容。
+        """
+        blocks = []
+        page_dict = page.get_text("dict", clip=clip_box)
+
+        for block in page_dict.get("blocks", []):
+            if block.get("type") == 0:  # 文本块
+                bbox = block["bbox"]
+                y_center = (bbox[1] + bbox[3]) / 2
+
+                # 拼接块内所有文本
+                text_lines = []
+                for line in block.get("lines", []):
+                    line_text = ""
+                    for span in line.get("spans", []):
+                        line_text += span.get("text", "")
+                    if line_text.strip():
+                        text_lines.append(line_text)
+
+                if text_lines:
+                    blocks.append({
+                        "text": "\n".join(text_lines),
+                        "page": page.number + 1,
+                        "bbox": bbox,
+                        "y_center": y_center,
+                        "type": "text"
+                    })
+
+        # 按阅读顺序排序(Y坐标为主,X坐标为辅)
+        blocks.sort(key=lambda b: (b["page"], b["bbox"][1], b["bbox"][0]))
+        return blocks
+
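上面的阅读顺序排序以 (页码, y0, x0) 为键:先按页,再按纵坐标自上而下,同一行内按横坐标自左向右。用最小数据示意:

```python
# 三个文本块:第 1 页的两个块(y=300 与 y=100)和第 2 页的一个块
blocks = [
    {"page": 1, "bbox": (50, 300, 200, 320)},
    {"page": 1, "bbox": (50, 100, 200, 120)},
    {"page": 2, "bbox": (50, 100, 200, 120)},
]
# 与 _extract_text_blocks_with_position 末尾相同的排序键
blocks.sort(key=lambda b: (b["page"], b["bbox"][1], b["bbox"][0]))

print([(b["page"], b["bbox"][1]) for b in blocks])  # → [(1, 100), (1, 300), (2, 100)]
```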
+    def _insert_ocr_blocks_into_chapters(
+        self,
+        chapter_blocks: Dict[Tuple[str, str], List[Dict[str, Any]]],
+        ocr_results: List[OcrResult]
+    ) -> None:
+        """
+        将OCR结果作为新的块插入到对应章节。
+
+        策略:
+        1. 找到表格Y坐标所在的页面
+        2. 在该页面的所有小节中,找到表格Y坐标介于哪两个文本块之间
+        3. 将OCR块插入到正确位置
+        """
+        # 按页码分组OCR结果
+        ocr_by_page: Dict[int, List[OcrResult]] = {}
+        for result in ocr_results:
+            if result.success:
+                if result.page_num not in ocr_by_page:
+                    ocr_by_page[result.page_num] = []
+                ocr_by_page[result.page_num].append(result)
+
+        # 处理每个包含表格的页面
+        for page_num, ocr_list in ocr_by_page.items():
+            # 找到该页面涉及的所有小节
+            page_sections = []
+            for (chap_name, sec_name), blocks in chapter_blocks.items():
+                # 检查该小节是否包含该页面的块
+                page_blocks = [b for b in blocks if b["page"] == page_num]
+                if page_blocks:
+                    page_sections.append({
+                        "chapter": chap_name,
+                        "section": sec_name,
+                        "blocks": page_blocks,
+                        "all_blocks": blocks,  # reference to the original list, used for insertion
+                    })
+
+            if not page_sections:
+                logger.warning(f"[OCR回填] 第{page_num}页没有匹配到任何小节")
+                continue
+
+            # process each OCR result
+            for ocr_result in sorted(ocr_list, key=lambda r: r.bbox[1]):
+                table_y_top = ocr_result.bbox[1]
+                table_y_bottom = ocr_result.bbox[3]
+                ocr_text = ocr_result.text
+
+                # build the table block
+                table_block = {
+                    "text": ocr_text,
+                    "page": page_num,
+                    "bbox": ocr_result.bbox,
+                    "y_center": (table_y_top + table_y_bottom) / 2,
+                    "type": "table"
+                }
+
+                # locate the target section
+                target_section = None
+                insert_index = -1
+
+                for ps in page_sections:
+                    # this section's blocks on this page, sorted by Y coordinate
+                    page_blocks = sorted(ps["blocks"], key=lambda b: b["bbox"][1])
+
+                    if not page_blocks:
+                        continue
+
+                    # find where the table should be inserted
+                    # strategy: which block does the table's top edge fall after?
+                    found = False
+                    for i, block in enumerate(page_blocks):
+                        block_y_bottom = block["bbox"][3]
+                        if i < len(page_blocks) - 1:
+                            next_y_top = page_blocks[i + 1]["bbox"][1]
+                        else:
+                            next_y_top = float('inf')
+
+                        # the table sits after the current block and before the next
+                        if block_y_bottom <= table_y_top < next_y_top:
+                            # locate the position in the original list
+                            try:
+                                insert_index = ps["all_blocks"].index(block) + 1
+                                target_section = ps
+                                found = True
+                                break
+                            except ValueError:
+                                continue
+
+                    # the table precedes all blocks on the page
+                    if not found and table_y_top < page_blocks[0]["bbox"][1]:
+                        try:
+                            insert_index = ps["all_blocks"].index(page_blocks[0])
+                            target_section = ps
+                            found = True
+                        except ValueError:
+                            continue
+
+                    # the table follows all blocks on the page
+                    if not found and table_y_bottom > page_blocks[-1]["bbox"][3]:
+                        try:
+                            insert_index = ps["all_blocks"].index(page_blocks[-1]) + 1
+                            target_section = ps
+                            found = True
+                        except ValueError:
+                            continue
+
+                    if found:
+                        break
+
+                # perform the insertion
+                if target_section and insert_index >= 0:
+                    target_section["all_blocks"].insert(insert_index, table_block)
+                    logger.debug(
+                        f"[OCR回填] 第{page_num}页表格(Y={table_y_top:.0f}) -> "
+                        f"{target_section['chapter']}/{target_section['section']} 位置{insert_index}"
+                    )
+                else:
+                    # fallback: append to the end of the page's first section
+                    if page_sections:
+                        ps = page_sections[0]
+                        ps["all_blocks"].append(table_block)
+                        logger.warning(
+                            f"[OCR回填] 第{page_num}页表格无法精确定位,追加到 {ps['chapter']}/{ps['section']}"
+                        )
+
+    @staticmethod
+    def _is_header_footer(line: str) -> bool:
+        return (
+            "四川路桥建设集团股份有限公司" in line
+            or "T梁运输及安装专项施工方案" in line
+            or line.isdigit()
+        )
+
+    @staticmethod
+    def _clean_chapter_title(line: str) -> str:
+        chapter_match = re.search(r"第[一二三四五六七八九十百]+章", line)
+        if not chapter_match:
+            return line.strip()
+
+        prefix = chapter_match.group(0)
+        remaining = line[chapter_match.end() :].strip()
+        remaining = re.sub(r"^[\.\s]+", "", remaining)
+        remaining = re.sub(r"\s+\d+\s*$", "", remaining)
+        remaining = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*", "", remaining)
+
+        if remaining:
+            return f"{prefix} {remaining}"
+        return prefix
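The placement rule implemented by `_insert_ocr_blocks_into_chapters` above reduces to: the table block lands right after the last text block whose bottom edge (`bbox[3]`) sits at or above the table's top edge (`bbox[1]`). A minimal standalone sketch of just that rule (hypothetical helper name; the real method additionally resolves which section owns the position and falls back to appending when no slot is found):

```python
def insert_table_block(page_blocks: list, table_block: dict) -> int:
    """Insert an OCR table block into a page's blocks by Y coordinate.

    Sketch only, not the project's code. `page_blocks` must already be
    sorted by top-Y; every bbox is [x0, y0, x1, y1]. Returns the index
    at which the table was inserted.
    """
    table_top = table_block["bbox"][1]
    insert_at = 0
    for i, block in enumerate(page_blocks):
        if block["bbox"][3] <= table_top:  # block ends above the table
            insert_at = i + 1              # table goes after this block
    page_blocks.insert(insert_at, table_block)
    return insert_at


# A table (y 60..180) between a block ending at y=50 and one starting at y=200:
blocks = [{"bbox": [0, 0, 100, 50]}, {"bbox": [0, 200, 100, 260]}]
table = {"bbox": [0, 60, 100, 180], "type": "table"}
insert_table_block(blocks, table)  # → 1
```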

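`_clean_chapter_title` above strips TOC leader dots and trailing page numbers from chapter lines. The same regex chain, re-created standalone (hypothetical function name) so the behavior can be checked:

```python
import re

def clean_chapter_title(line: str) -> str:
    """Standalone re-creation of the _clean_chapter_title logic shown above."""
    chapter_match = re.search(r"第[一二三四五六七八九十百]+章", line)
    if not chapter_match:
        return line.strip()
    prefix = chapter_match.group(0)
    remaining = line[chapter_match.end():].strip()
    remaining = re.sub(r"^[\.\s]+", "", remaining)    # leading dots/spaces
    remaining = re.sub(r"\s+\d+\s*$", "", remaining)  # trailing page number
    remaining = re.sub(r"[\._\-]{3,}[^\u4e00-\u9fa5a-zA-Z0-9]*", "", remaining)  # leader dots
    return f"{prefix} {remaining}" if remaining else prefix


clean_chapter_title("第一章 工程概况.......... 5")  # → "第一章 工程概况"
clean_chapter_title("前言")                          # → "前言" (no chapter prefix)
```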
+ 5 - 4
core/construction_review/component/minimal_pipeline/simple_processor.py

@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional, Tuple, List
 from foundation.observability.logger.loggering import review_logger as logger
 from foundation.observability.cachefiles import cache, CacheBaseDir
 
-from .pdf_extractor2 import PdfStructureExtractor
-#from .pdf_extractor import PdfStructureExtractor
+#from .pdf_extractor2 import PdfStructureExtractor
+from .pdf_extractor import PdfStructureExtractor
 from .toc_builder import build_toc_items_from_structure
 from .chunk_assembler import assemble_chunks
 from ..doc_worker.classification.hierarchy_classifier import HierarchyClassifier
@@ -479,8 +479,9 @@ class SimpleDocumentProcessor:
             l2_threshold: threshold on the level-2 subsection extraction rate
         """
         chapters = structure.get("chapters", {})
-        if not chapters:
-            return
+        # Ensure "chapters" exists (even when empty) so quality_check can be attached
+        if "chapters" not in structure:
+            structure["chapters"] = chapters
 
         # count the level-1 chapters
         l1_count = len(chapters)
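The hunk above replaces an early return with a guard that guarantees `structure["chapters"]` exists (possibly empty) before quality metrics are attached. Python's `dict.setdefault` expresses the same guard in one call; a sketch under that assumption (hypothetical function name, not the project's code):

```python
def ensure_chapters(structure: dict) -> dict:
    """Return structure["chapters"], creating an empty dict if absent.

    One-call equivalent of the two-line guard in the hunk above.
    """
    return structure.setdefault("chapters", {})


# A document where no chapter was recognized still gets a "chapters" key:
structure = {}
chapters = ensure_chapters(structure)
# "chapters" is now present (and empty), so quality_check can be attached later.
```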

+ 2 - 2
core/construction_review/component/reviewers/check_completeness/components/result_analyzer.py

@@ -255,7 +255,7 @@ class ResultAnalyzer(IResultAnalyzer):
         for row in summary_rows:
             level2 = (row.get("二级目录") or "").strip()
             requirement = (row.get("内容要求") or "").strip()
-            reference_source = '《桥梁公司危险性较大工程管理实施细则(2025版)》'
+            reference_source = '交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)'
             reason= f"参照:{reference_source} 中的内容要求,{row.get('section_label', '')}内容属于,专项施工方案内容要求中的 【{suorces_eum[row.get('标签', '')]}】 板块,应包含{requirement}"
             review_references = (row.get("依据") or "").strip()
             if level2 in row.get("content", ""):
@@ -295,7 +295,7 @@ class ResultAnalyzer(IResultAnalyzer):
                     "issue_point": issue_point,
                     "location": row.get("section_label", ""),
                     "suggestion": suggestion,
-                    "reason": f"根据《桥梁公司危险性较大工程管理实施细则(2025版)》,{section_label}的'{level2_name}'应包含:{requirement}。当前缺失:{missing_content_text}",
+                    "reason": f"根据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024),{section_label}的'{level2_name}'应包含:{requirement}。当前缺失:{missing_content_text}",
                     "risk_level": risk_level
                 },
                 "exist_issue": True,

+ 7 - 7
core/construction_review/component/reviewers/completeness_reviewer.py

@@ -276,7 +276,7 @@ class LightweightCompletenessChecker:
 
             reference = f"""
 【规范参考信息】
-根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{first_name}'章节应包含以下内容:
+根据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024),'{first_name}'章节应包含以下内容:
 {chr(10).join(related_specs)}
 """
 
@@ -294,7 +294,7 @@ class LightweightCompletenessChecker:
 
             reference = f"""
 【规范参考信息】
-根据《桥梁公司危险性较大工程管理实施细则(2025版)》,'{second_name}'章节应包含以下三级内容要点:
+根据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024),'{second_name}'章节应包含以下三级内容要点:
 {chr(10).join(tertiary_info)}
 """
 
@@ -962,7 +962,7 @@ JSON输出:"""
             if first_code not in actual_first:
                 # issue_point 和 reason 使用简单拼接
                 issue_point = f"【一级章节缺失】'{first_name}'整个章节不存在"
-                reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"
+                reason = f"依据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)规定,文档必须包含'{first_name}'一级章节,当前正文中未发现该章节任何内容"
 
                 # 尝试使用LLM生成 suggestion
                 llm_result = await self._generate_recommendation_with_llm(
@@ -1005,7 +1005,7 @@ JSON输出:"""
 
                     # issue_point 和 reason 使用简单拼接
                     issue_point = f"【二级章节缺失】{actual_first_name} > '{second_name}'整个章节不存在"
-                    reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{actual_first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
+                    reason = f"依据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)规定,'{actual_first_name}'下应包含'{second_name}'二级章节,当前正文中未发现该章节内容"
 
                     # 尝试使用LLM生成 suggestion
                     llm_result = await self._generate_recommendation_with_llm(
@@ -1081,7 +1081,7 @@ JSON输出:"""
                         "issue_point": f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'",
                         "location": actual_second_name,  # 三级缺失定位到二级小节
                         "suggestion": suggestion,
-                        "reason": f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,'{second_name}'下应包含'{t_item.third_cn}'内容要点",
+                        "reason": f"依据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)规定,'{second_name}'下应包含'{t_item.third_cn}'内容要点",
                         "first_seq": first_seq,
                         "second_seq": second_seq,
                         "third_seq": t_item.third_seq,
@@ -1103,7 +1103,7 @@ JSON输出:"""
 
                 # issue_point 和 reason 使用简单拼接(一致性审查)
                 issue_point = f"【目录正文不一致】'{location}'目录已列但正文无内容"
-                reason = f"依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,目录应与正文保持一致。目录页列有'{sec_title}'章节,但正文中未发现对应内容"
+                reason = f"依据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)规定,目录应与正文保持一致。目录页列有'{sec_title}'章节,但正文中未发现对应内容"
 
                 # 尝试使用LLM生成 suggestion
                 llm_result = await self._generate_recommendation_with_llm(
@@ -1138,7 +1138,7 @@ JSON输出:"""
                 "issue_point": "文档完整性良好",
                 "location": "",
                 "suggestion": "无需补充",
-                "reason": "依据《桥梁公司危险性较大工程管理实施细则(2025版)》规定,文档已覆盖所有章节与内容要点",
+                "reason": "依据交通运输部《公路水运危险性较大工程专项施工方案编制审查规程》(JT/T 1495—2024)规定,文档已覆盖所有章节与内容要点",
             })
 
         return recommendations