|
|
@@ -23,7 +23,18 @@ EMPTY_SECTION_PLACEHOLDER = "[本节无纯文本,原文档中可能为纯图
|
|
|
TABLE_OCR_START = "[表格OCR识别结果]:"
|
|
|
TABLE_OCR_END = "[/表格]"
|
|
|
CN_LIST_L1_NUMERIC_L2_RULE = "Rule_8_中文序号章数字小节派"
|
|
|
-
|
|
|
# Standard catalog used to validate table-of-contents candidates:
# each key is a chapter title, each value lists the section titles
# expected under that chapter. Titles are compared after normalization
# by PdfStructureExtractor._is_standard_catalog_name.
CATALOG_LIST = {
    '编制依据': [
        '法律法规',
        '标准范围',
        '文件制度',
        '编制原则',
        '编制范围',
    ],
    '工程概况': [
        '设计概况',
        '工程地质与水文气象',
        '周边环境',
        '施工平面及立面布置',
        '施工要求和技术保证条件',
        '风险辨别与分级',
        '参建各方责任主体单位',
    ],
    '施工计划': [
        '施工进度计划',
        '施工材料计划',
        '施工设备计划',
        '劳动力计划',
        '安全生产费用使用计划',
    ],
    '施工工艺技术': [
        '主要施工方法概述',
        '技术参数',
        '施工准备',
        '工艺流程',
        '施工方法及操作要求',
        '检查要求',
    ],
    '安全保证措施': [
        '安全保证体系',
        '组织保证措施',
        '技术保证措施',
        '监测监控措施',
        '应急处置措施',
    ],
    '质量保证措施': [
        '质量保证体系',
        '质量目标',
        '工程创优规划',
        '质量控制程序与具体措施',
    ],
    '环境保证措施': [
        '环境保证体系',
        '环境保护组织机构',
        '环境保护及文明施工措施',
    ],
    '施工管理及作业人员配备与分工': [
        '施工管理人员',
        '专职安全生产管理人员',
        '特种作业人员',
        '其它作业人员',
    ],
    '验收要求': [
        '验收标准',
        '验收程序',
        '验收内容',
        '验收时间',
        '验收人员',
    ],
    '其它资料': [
        '计算书',
        '相关施工图纸',
        '附图附表',
        '编制及审核人员情况',
    ],
}
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
|
class BodyLine:
|
|
|
@@ -786,9 +797,10 @@ class PdfStructureExtractor:
|
|
|
continue
|
|
|
|
|
|
has_toc = self._is_toc_line(line)
|
|
|
+ has_unconfirmed_toc = self._looks_like_toc_candidate(line) and not has_toc
|
|
|
|
|
|
match_l1 = rule_set["l1"].match(line)
|
|
|
- if match_l1 and not has_toc:
|
|
|
+ if match_l1 and not has_toc and not has_unconfirmed_toc:
|
|
|
core_text = self._blind_strip(line)
|
|
|
if len(core_text) < 2:
|
|
|
pending_prefix = line
|
|
|
@@ -858,7 +870,7 @@ class PdfStructureExtractor:
|
|
|
chapter_l2_style_hint = self._detect_cn_order_l2_style(line)
|
|
|
|
|
|
match_l2 = rule_set["l2"].match(line)
|
|
|
- if current_l1 and match_l2 and not has_toc:
|
|
|
+ if current_l1 and match_l2 and not has_toc and not has_unconfirmed_toc:
|
|
|
if self._is_valid_heading_strict(line, is_l1=False):
|
|
|
if is_numeric_l2:
|
|
|
l2_main_num = int(match_l2.group(1))
|
|
|
@@ -1417,9 +1429,66 @@ class PdfStructureExtractor:
|
|
|
"""判断一行文本是否像目录行。"""
|
|
|
|
|
|
clean_line = str(line or "").strip()
|
|
|
- if cls.TOC_PATTERN.search(clean_line):
|
|
|
+ if not clean_line:
|
|
|
+ return False
|
|
|
+
|
|
|
+ compact = re.sub(r"\s+", "", clean_line)
|
|
|
+ if compact in {"目录", "目", "录"}:
|
|
|
return True
|
|
|
- return bool(re.search(r"\s{2,}\d{1,3}$", clean_line))
|
|
|
+
|
|
|
+ if not cls._looks_like_toc_candidate(clean_line):
|
|
|
+ return False
|
|
|
+
|
|
|
+ return cls._is_standard_catalog_name(clean_line)
|
|
|
+
|
|
|
@classmethod
def _looks_like_toc_candidate(cls, line: str) -> bool:
    """Report whether the text merely has the *shape* of a TOC line.

    This is a shape check only: a True result does not mean the line has
    passed the standard-catalog validation.

    Returns True when the stripped text matches ``cls.TOC_PATTERN`` or ends
    with at least two whitespace characters followed by a 1-3 digit number;
    empty or None input yields False.
    """
    text = str(line or "").strip()
    if not text:
        return False
    if cls.TOC_PATTERN.search(text):
        return True
    # Fallback shape: "<title><2+ spaces><page number>" at end of line.
    return re.search(r"\s{2,}\d{1,3}$", text) is not None
|
|
|
+
|
|
|
@classmethod
def _is_standard_catalog_name(cls, line: str) -> bool:
    """Check a TOC candidate against the standard catalog titles.

    The line is normalized with ``_normalize_catalog_name`` and is accepted
    only when the result equals the normalized form of some chapter title
    or section title in ``CATALOG_LIST``. A line that normalizes to an empty
    string is rejected.
    """
    target = cls._normalize_catalog_name(line)
    if not target:
        return False

    normalize = cls._normalize_catalog_name
    # Walk each chapter title followed by its section titles, in order.
    return any(
        target == normalize(title)
        for chapter, sections in CATALOG_LIST.items()
        for title in (chapter, *sections)
    )
|
|
|
+
|
|
|
@classmethod
def _normalize_catalog_name(cls, text: str) -> str:
    """Reduce a TOC-style line to the bare title used for catalog matching.

    Steps, in order:
      1. ``cls._strip_catalog_page_suffix`` is applied first (defined
         elsewhere; presumably removes the trailing page number / dot
         leaders -- confirm against its implementation).
      2. Whitespace runs are collapsed and the text is trimmed.
      3. Each numbering-prefix pattern below is tried once, in sequence, so
         stacked prefixes such as "第一章 1 ..." are both removed.
      4. All remaining whitespace is removed and leading/trailing
         separator punctuation is stripped.

    Args:
        text: Raw line text; None or empty input is tolerated.

    Returns:
        The normalized title, or "" when nothing is left.
    """
    cleaned = cls._strip_catalog_page_suffix(text)
    cleaned = re.sub(r"\s+", " ", str(cleaned or "").strip())
    if not cleaned:
        return ""

    # Full-width punctuation is written as \uXXXX escapes so that both the
    # ASCII and the full-width form are matched explicitly:
    #   \uFF1A full-width colon, \uFF0E full-width full stop,
    #   \uFF08 / \uFF09 full-width parentheses.
    prefix_patterns = (
        # "第N章 / 第N节 / 第N部分 / 第N部 / 第N篇". The unit is an
        # alternation rather than a character class so the two-character
        # unit "部分" is consumed as a whole instead of leaving a stray "分".
        r"^第\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*(?:部分|[章节部篇])[\s、:\uFF1A.-]*",
        # Arabic numbering: "1", "1.2", "1.2.3" followed by a delimiter,
        # whitespace, or directly by a CJK character.
        r"^\d+(?:\.\d+)*(?:[.\uFF0E、)\uFF09\]]|\s+|(?=[\u4e00-\u9fa5]))",
        # Chinese numeral numbering: "一、", "二)", "三 " ...
        r"^[一二三四五六七八九十百零两]+(?:[、)\uFF09\]]|\s+|(?=[\u4e00-\u9fa5]))",
        # Parenthesized numbering: "(1)", "(一)".
        r"^[(\uFF08]\s*(?:\d+|[一二三四五六七八九十百零两]+)\s*[)\uFF09]\s*",
        # Bracketed numbering: "[1]", "【1】".
        r"^[【\[]\s*\d+\s*[\]】]\s*",
    )
    for pattern in prefix_patterns:
        cleaned = re.sub(pattern, "", cleaned, count=1).strip()

    cleaned = re.sub(r"\s+", "", cleaned)
    return cleaned.strip(":\uFF1A、.\uFF0E")
|
|
|
|
|
|
@classmethod
|
|
|
def _is_header_footer(cls, line: str) -> bool:
|