2 週間前 · a75703114d
--- a/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
+++ b/core/construction_review/component/minimal_pipeline/pdf_extractor1.py
@@ -23,7 +23,18 @@ EMPTY_SECTION_PLACEHOLDER = "[本节无纯文本，原文档中可能为纯图
 
				 TABLE_OCR_START = "[表格OCR识别结果]:"
			
 
				 TABLE_OCR_END = "[/表格]"
			
 
				 CN_LIST_L1_NUMERIC_L2_RULE = "Rule_8_中文序号章数字小节派"
			
 
				-
			
 
				+CATALOG_LIST = {  # chapter title -> section titles; both levels are compared against TOC candidates
			
 
				+    "编制依据": ["法律法规", "标准范围", "文件制度", "编制原则", "编制范围"],
			
 
				+    "工程概况": ["设计概况", "工程地质与水文气象", "周边环境", "施工平面及立面布置", "施工要求和技术保证条件", "风险辨别与分级", "参建各方责任主体单位"],
			
 
				+    "施工计划": ["施工进度计划", "施工材料计划", "施工设备计划", "劳动力计划", "安全生产费用使用计划"],
			
 
				+    "施工工艺技术": ["主要施工方法概述", "技术参数", "施工准备", "工艺流程", "施工方法及操作要求", "检查要求"],
			
 
				+    "安全保证措施": ["安全保证体系", "组织保证措施", "技术保证措施", "监测监控措施", "应急处置措施"],
			
 
				+    "质量保证措施": ["质量保证体系", "质量目标", "工程创优规划", "质量控制程序与具体措施"],
			
 
				+    "环境保证措施": ["环境保证体系", "环境保护组织机构", "环境保护及文明施工措施"],
			
 
				+    "施工管理及作业人员配备与分工": ["施工管理人员", "专职安全生产管理人员", "特种作业人员", "其它作业人员"],
			
 
				+    "验收要求": ["验收标准", "验收程序", "验收内容", "验收时间", "验收人员"],
			
 
				+    "其它资料": ["计算书", "相关施工图纸", "附图附表", "编制及审核人员情况"],
			
 
				+}
			
 
				 
			
 
				 @dataclass(frozen=True)
			
 
				 class BodyLine:
			
@@ -786,9 +797,10 @@ class PdfStructureExtractor:
 
				                 continue
			
 
				 
			
 
				             has_toc = self._is_toc_line(line)
			
 
				+            has_unconfirmed_toc = self._looks_like_toc_candidate(line) and not has_toc
			
 
				 
			
 
				             match_l1 = rule_set["l1"].match(line)
			
 
				-            if match_l1 and not has_toc:
			
 
				+            if match_l1 and not has_toc and not has_unconfirmed_toc:
			
 
				                 core_text = self._blind_strip(line)
			
 
				                 if len(core_text) < 2:
			
 
				                     pending_prefix = line
			
@@ -858,7 +870,7 @@ class PdfStructureExtractor:
 
				                     chapter_l2_style_hint = self._detect_cn_order_l2_style(line)
			
 
				 
			
 
				             match_l2 = rule_set["l2"].match(line)
			
 
				-            if current_l1 and match_l2 and not has_toc:
			
 
				+            if current_l1 and match_l2 and not has_toc and not has_unconfirmed_toc:
			
 
				                 if self._is_valid_heading_strict(line, is_l1=False):
			
 
				                     if is_numeric_l2:
			
 
				                         l2_main_num = int(match_l2.group(1))
			
@@ -1417,9 +1429,66 @@ class PdfStructureExtractor:
 
				         """判断一行文本是否像目录行。"""
			
 
				 
			
 
				         clean_line = str(line or "").strip()
			
 
				-        if cls.TOC_PATTERN.search(clean_line):
			
 
				+        if not clean_line:
			
 
				+            return False
			
 
				+
			
 
				+        compact = re.sub(r"\s+", "", clean_line)
			
 
				+        if compact in {"目录", "目", "录"}:
			
 
				             return True
			
 
				-        return bool(re.search(r"\s{2,}\d{1,3}$", clean_line))
			
 
				+
			
 
				+        if not cls._looks_like_toc_candidate(clean_line):
			
 
				+            return False
			
 
				+
			
 
				+        return cls._is_standard_catalog_name(clean_line)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _looks_like_toc_candidate(cls, line: str) -> bool:
			
 
				+        """Check only whether the text is shaped like a TOC line; this does not mean it passed the standard-catalog check."""
			
 
				+
			
 
				+        text = str(line or "").strip()
			
 
				+        if text == "":
			
 
				+            return False
			
 
				+        if cls.TOC_PATTERN.search(text):
			
 
				+            return True
			
 
				+        # Fallback shape: two or more whitespace characters followed by a 1-3 digit number ending the line.
			
 
				+        return re.search(r"\s{2,}\d{1,3}$", text) is not None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _is_standard_catalog_name(cls, line: str) -> bool:
			
 
				+        """Return True only when the candidate, once normalized, equals a chapter or section title in CATALOG_LIST."""
			
 
				+
			
 
				+        candidate = cls._normalize_catalog_name(line)
			
 
				+        if not candidate:
			
 
				+            return False
			
 
				+
			
 
				+        # Chapter titles (keys) and their section titles (values) are compared the same way.
			
 
				+        for chapter, sections in CATALOG_LIST.items():
			
 
				+            for title in (chapter, *sections):
			
 
				+                if candidate == cls._normalize_catalog_name(title):
			
 
				+                    return True
			
 
				+        return False
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _normalize_catalog_name(cls, text: str) -> str:
			
 
				+        """Strip catalog numbering, page number, dot leaders and whitespace, keeping only the title used for standard-catalog comparison."""
			
 
				+
			
 
				+        cleaned = cls._strip_catalog_page_suffix(text)
			
 
				+        cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
			
 
				+        if not cleaned:
			
 
				+            return ""
			
 
				+        # Prefixes are peeled in order, one substitution each; "部分" is matched as a unit so "第一部分" leaves no stray "分".
			
 
				+        prefix_patterns = (
			
 
				+            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*(?:部分|[章节部篇])[\s、：:.-]*",
			
 
				+            r"^\d+(?:\.\d+)*(?:[\.．、）)\]]|\s+|(?=[\u4e00-\u9fa5]))",
			
 
				+            r"^[一二三四五六七八九十百零两]+(?:[、）)\]]|\s+|(?=[\u4e00-\u9fa5]))",
			
 
				+            r"^[（(]\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[）)]\s*",
			
 
				+            r"^[【\[]\s*\d+\s*[\]】]\s*",
			
 
				+        )
			
 
				+        for pattern in prefix_patterns:
			
 
				+            cleaned = re.sub(pattern, "", cleaned, count=1).strip()
			
 
				+
			
 
				+        cleaned = re.sub(r"\s+", "", cleaned)
			
 
				+        return cleaned.strip("：:、.．")
			
 
				 
			
 
				     @classmethod
			
 
				     def _is_header_footer(cls, line: str) -> bool: