ソースを参照

验证标准目录

tangle 2 週間 前
コミット
a75703114d

+ 74 - 5
core/construction_review/component/minimal_pipeline/pdf_extractor1.py

@@ -23,7 +23,18 @@ EMPTY_SECTION_PLACEHOLDER = "[本节无纯文本,原文档中可能为纯图
 TABLE_OCR_START = "[表格OCR识别结果]:"
 TABLE_OCR_END = "[/表格]"
 CN_LIST_L1_NUMERIC_L2_RULE = "Rule_8_中文序号章数字小节派"
-
+CATALOG_LIST = {  # standard chapter title -> list of its standard section titles
+    "编制依据": ["法律法规", "标准范围", "文件制度", "编制原则", "编制范围"],
+    "工程概况": ["设计概况", "工程地质与水文气象", "周边环境", "施工平面及立面布置", "施工要求和技术保证条件", "风险辨别与分级", "参建各方责任主体单位"],  # NOTE(review): "风险辨别" may be a typo for "风险辨识" - confirm
+    "施工计划": ["施工进度计划", "施工材料计划", "施工设备计划", "劳动力计划", "安全生产费用使用计划"],
+    "施工工艺技术": ["主要施工方法概述", "技术参数", "施工准备", "工艺流程", "施工方法及操作要求", "检查要求"],
+    "安全保证措施": ["安全保证体系", "组织保证措施", "技术保证措施", "监测监控措施", "应急处置措施"],
+    "质量保证措施": ["质量保证体系", "质量目标", "工程创优规划", "质量控制程序与具体措施"],
+    "环境保证措施": ["环境保证体系", "环境保护组织机构", "环境保护及文明施工措施"],
+    "施工管理及作业人员配备与分工": ["施工管理人员", "专职安全生产管理人员", "特种作业人员", "其它作业人员"],
+    "验收要求": ["验收标准", "验收程序", "验收内容", "验收时间", "验收人员"],
+    "其它资料": ["计算书", "相关施工图纸", "附图附表", "编制及审核人员情况"],
+}
 
 @dataclass(frozen=True)
 class BodyLine:
@@ -786,9 +797,10 @@ class PdfStructureExtractor:
                 continue
 
             has_toc = self._is_toc_line(line)
+            has_unconfirmed_toc = self._looks_like_toc_candidate(line) and not has_toc
 
             match_l1 = rule_set["l1"].match(line)
-            if match_l1 and not has_toc:
+            if match_l1 and not has_toc and not has_unconfirmed_toc:
                 core_text = self._blind_strip(line)
                 if len(core_text) < 2:
                     pending_prefix = line
@@ -858,7 +870,7 @@ class PdfStructureExtractor:
                     chapter_l2_style_hint = self._detect_cn_order_l2_style(line)
 
             match_l2 = rule_set["l2"].match(line)
-            if current_l1 and match_l2 and not has_toc:
+            if current_l1 and match_l2 and not has_toc and not has_unconfirmed_toc:
                 if self._is_valid_heading_strict(line, is_l1=False):
                     if is_numeric_l2:
                         l2_main_num = int(match_l2.group(1))
@@ -1417,9 +1429,66 @@ class PdfStructureExtractor:
         """判断一行文本是否像目录行。"""
 
         clean_line = str(line or "").strip()
-        if cls.TOC_PATTERN.search(clean_line):
+        if not clean_line:
+            return False
+
+        compact = re.sub(r"\s+", "", clean_line)
+        if compact in {"目录", "目", "录"}:
             return True
-        return bool(re.search(r"\s{2,}\d{1,3}$", clean_line))
+
+        if not cls._looks_like_toc_candidate(clean_line):
+            return False
+
+        return cls._is_standard_catalog_name(clean_line)
+
+    @classmethod
+    def _looks_like_toc_candidate(cls, line: str) -> bool:
+        """Check only whether ``line`` is shaped like a TOC entry; True is not a catalog match."""
+
+        text = str(line or "").strip()
+        if not text:
+            return False
+        if cls.TOC_PATTERN.search(text):
+            return True
+        # Fallback shape: two or more whitespace characters, then a 1-3 digit number at the end.
+        return re.search(r"\s{2,}\d{1,3}$", text) is not None
+
+    @classmethod
+    def _is_standard_catalog_name(cls, line: str) -> bool:
+        """Return True when ``line``, once normalized, equals a CATALOG_LIST title.
+
+        Keys (chapters) and values (sections) get the same normalization first.
+        """
+        target = cls._normalize_catalog_name(line)
+        if not target:
+            return False
+        return any(
+            target == cls._normalize_catalog_name(title)
+            for chapter, sections in CATALOG_LIST.items()
+            for title in (chapter, *sections)
+        )
+
+    @classmethod
+    def _normalize_catalog_name(cls, text: str) -> str:
+        """Reduce a TOC line to the bare title used for standard-catalog matching.
+
+        Page suffix, numbering prefix, whitespace and edge punctuation are removed.
+        """
+        cleaned = cls._strip_catalog_page_suffix(text)
+        cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
+        if not cleaned:
+            return ""
+        prefix_patterns = (  # full-width punctuation is spelled as \uXXXX escapes
+            r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*(?:部分|[章节部篇])[\s、:\uff1a.-]*",
+            r"^\d+(?:\.\d+)*(?:[\.\uff0e、)\uff09\]]|\s+|(?=[\u4e00-\u9fa5]))",
+            r"^[一二三四五六七八九十百零两]+(?:[、)\uff09\]]|\s+|(?=[\u4e00-\u9fa5]))",
+            r"^[(\uff08]\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[)\uff09]\s*",
+            r"^[【\[]\s*\d+\s*[\]】]\s*",
+        )
+        for pattern in prefix_patterns:
+            cleaned = re.sub(pattern, "", cleaned, count=1).strip()
+        cleaned = re.sub(r"\s+", "", cleaned)
+        return cleaned.strip(":\uff1a、.\uff0e")
 
     @classmethod
     def _is_header_footer(cls, line: str) -> bool: