|
@@ -17,18 +17,8 @@ import numpy as np
|
|
|
|
|
|
|
|
from foundation.observability.logger.loggering import review_logger as logger
|
|
from foundation.observability.logger.loggering import review_logger as logger
|
|
|
|
|
|
|
|
-# 尝试导入 YOLO 相关库
|
|
|
|
|
-try:
|
|
|
|
|
- from ultralytics import YOLO
|
|
|
|
|
- YOLO_AVAILABLE = True
|
|
|
|
|
-except ImportError:
|
|
|
|
|
- YOLO_AVAILABLE = False
|
|
|
|
|
-
|
|
|
|
|
-try:
|
|
|
|
|
- from PIL import Image
|
|
|
|
|
- PIL_AVAILABLE = True
|
|
|
|
|
-except ImportError:
|
|
|
|
|
- PIL_AVAILABLE = False
|
|
|
|
|
|
|
+from ultralytics import YOLO
|
|
|
|
|
+from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
|
|
@dataclass
|
|
@@ -94,26 +84,15 @@ class TOCCatalogExtractor:
|
|
|
self.ocr_timeout = ocr_timeout
|
|
self.ocr_timeout = ocr_timeout
|
|
|
|
|
|
|
|
self._model = None
|
|
self._model = None
|
|
|
- self._yolo_available = YOLO_AVAILABLE and PIL_AVAILABLE
|
|
|
|
|
|
|
|
|
|
def _load_model(self) -> bool:
|
|
def _load_model(self) -> bool:
|
|
|
- """加载 YOLO 模型"""
|
|
|
|
|
- if not self._yolo_available:
|
|
|
|
|
- logger.debug("[TOC检测] YOLO库未安装,跳过目录检测")
|
|
|
|
|
- return False
|
|
|
|
|
-
|
|
|
|
|
|
|
+ """加载 YOLO 模型,缺少依赖或模型文件直接报错"""
|
|
|
if not os.path.exists(self.model_path):
|
|
if not os.path.exists(self.model_path):
|
|
|
- logger.debug(f"[TOC检测] 模型文件不存在: {self.model_path}")
|
|
|
|
|
- return False
|
|
|
|
|
|
|
+ raise FileNotFoundError(f"[TOC检测] YOLO模型文件不存在: {self.model_path}")
|
|
|
|
|
|
|
|
if self._model is None:
|
|
if self._model is None:
|
|
|
- try:
|
|
|
|
|
- logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
|
|
|
|
|
- self._model = YOLO(self.model_path)
|
|
|
|
|
- return True
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.warning(f"[TOC检测] 模型加载失败: {e}")
|
|
|
|
|
- return False
|
|
|
|
|
|
|
+ logger.info(f"[TOC检测] 正在加载YOLO模型: {self.model_path}")
|
|
|
|
|
+ self._model = YOLO(self.model_path)
|
|
|
return True
|
|
return True
|
|
|
|
|
|
|
|
def detect_and_extract(
|
|
def detect_and_extract(
|
|
@@ -361,7 +340,6 @@ class TOCCatalogExtractor:
|
|
|
force_smaller: 是否强制更小的尺寸(用于处理过大的图片)
|
|
force_smaller: 是否强制更小的尺寸(用于处理过大的图片)
|
|
|
"""
|
|
"""
|
|
|
try:
|
|
try:
|
|
|
- from PIL import Image
|
|
|
|
|
img = Image.open(io.BytesIO(img_bytes))
|
|
img = Image.open(io.BytesIO(img_bytes))
|
|
|
|
|
|
|
|
if img.mode in ('RGBA', 'LA', 'P'):
|
|
if img.mode in ('RGBA', 'LA', 'P'):
|
|
@@ -396,15 +374,20 @@ class TOCCatalogExtractor:
|
|
|
|
|
|
|
|
def _parse_toc_text(self, text: str) -> Dict[str, Any]:
|
|
def _parse_toc_text(self, text: str) -> Dict[str, Any]:
|
|
|
"""
|
|
"""
|
|
|
- 解析目录文本为结构化数据
|
|
|
|
|
|
|
+ 解析目录文本为结构化数据,输出标准格式
|
|
|
|
|
|
|
|
- 支持格式:
|
|
|
|
|
- - 第一章 XXX...................1
|
|
|
|
|
- - 一、XXX......................2
|
|
|
|
|
- - 1. XXX ......................3
|
|
|
|
|
|
|
+ 标准格式:
|
|
|
|
|
+ 第X章 XXX
|
|
|
|
|
+ 一、XXX
|
|
|
|
|
+ 二、XXX
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
- {"chapters": [...], "total_chapters": N}
|
|
|
|
|
|
|
+ {
|
|
|
|
|
+ "chapters": [...],
|
|
|
|
|
+ "total_chapters": N,
|
|
|
|
|
+ "raw_ocr_text": "原始OCR文本",
|
|
|
|
|
+ "formatted_text": "标准格式文本"
|
|
|
|
|
+ }
|
|
|
"""
|
|
"""
|
|
|
lines = text.strip().split('\n')
|
|
lines = text.strip().split('\n')
|
|
|
chapters = []
|
|
chapters = []
|
|
@@ -422,6 +405,13 @@ class TOCCatalogExtractor:
|
|
|
r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
|
|
r'([0-9]+)[\.\s]+(.+?)\s*[\.\s]+(\d+)\s*$'
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
|
|
+ # 中文数字映射
|
|
|
|
|
+ chinese_nums = {
|
|
|
|
|
+ '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
|
|
|
|
|
+ '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
|
|
|
|
+ '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
for line in lines:
|
|
for line in lines:
|
|
|
line = line.strip()
|
|
line = line.strip()
|
|
|
if not line or len(line) < 3:
|
|
if not line or len(line) < 3:
|
|
@@ -442,24 +432,37 @@ class TOCCatalogExtractor:
|
|
|
if current_chapter:
|
|
if current_chapter:
|
|
|
chapters.append(current_chapter)
|
|
chapters.append(current_chapter)
|
|
|
|
|
|
|
|
|
|
+ # 标准化为阿拉伯数字
|
|
|
|
|
+ if chapter_num.isdigit():
|
|
|
|
|
+ idx = int(chapter_num)
|
|
|
|
|
+ else:
|
|
|
|
|
+ idx = chinese_nums.get(chapter_num, len(chapters) + 1)
|
|
|
|
|
+
|
|
|
current_chapter = {
|
|
current_chapter = {
|
|
|
- "index": self._chinese_to_number(chapter_num) if not chapter_num.isdigit() else int(chapter_num),
|
|
|
|
|
- "title": f"第{chapter_num}章 {title}",
|
|
|
|
|
|
|
+ "index": idx,
|
|
|
|
|
+ "title": f"第{idx}章 {title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"original": line,
|
|
"original": line,
|
|
|
"subsections": []
|
|
"subsections": []
|
|
|
}
|
|
}
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- # 尝试匹配节(二级)
|
|
|
|
|
|
|
+ # 尝试匹配节(二级)- 标准化为一、二、三格式
|
|
|
section_match = section_pattern.search(line)
|
|
section_match = section_pattern.search(line)
|
|
|
if section_match and current_chapter:
|
|
if section_match and current_chapter:
|
|
|
section_num = section_match.group(1)
|
|
section_num = section_match.group(1)
|
|
|
title = section_match.group(2).strip()
|
|
title = section_match.group(2).strip()
|
|
|
page = section_match.group(3).strip()
|
|
page = section_match.group(3).strip()
|
|
|
|
|
|
|
|
|
|
+ # 标准化节编号
|
|
|
|
|
+ if section_num.isdigit():
|
|
|
|
|
+ section_idx = int(section_num)
|
|
|
|
|
+ section_cn = self._number_to_chinese(section_idx)
|
|
|
|
|
+ else:
|
|
|
|
|
+ section_cn = section_num
|
|
|
|
|
+
|
|
|
current_chapter["subsections"].append({
|
|
current_chapter["subsections"].append({
|
|
|
- "title": f"{section_num}、{title}",
|
|
|
|
|
|
|
+ "title": f"{section_cn}、{title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"level": 2,
|
|
"level": 2,
|
|
|
"original": line
|
|
"original": line
|
|
@@ -472,23 +475,25 @@ class TOCCatalogExtractor:
|
|
|
title = generic_match.group(2).strip()
|
|
title = generic_match.group(2).strip()
|
|
|
page = generic_match.group(3).strip()
|
|
page = generic_match.group(3).strip()
|
|
|
|
|
|
|
|
- # 判断是章还是节(根据缩进或内容特征)
|
|
|
|
|
|
|
+ # 判断是章还是节(根据内容特征)
|
|
|
if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
|
|
if any(kw in title for kw in ['编制依据', '工程概况', '施工计划', '施工工艺',
|
|
|
'安全保证', '质量保证', '环境保证', '人员配备',
|
|
'安全保证', '质量保证', '环境保证', '人员配备',
|
|
|
'验收要求']):
|
|
'验收要求']):
|
|
|
- # 可能是章标题(没有"第X章"前缀的变体)
|
|
|
|
|
chapters.append(current_chapter)
|
|
chapters.append(current_chapter)
|
|
|
|
|
+ idx = len(chapters) + 1
|
|
|
current_chapter = {
|
|
current_chapter = {
|
|
|
- "index": len(chapters) + 1,
|
|
|
|
|
- "title": title,
|
|
|
|
|
|
|
+ "index": idx,
|
|
|
|
|
+ "title": f"第{idx}章 {title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"original": line,
|
|
"original": line,
|
|
|
"subsections": []
|
|
"subsections": []
|
|
|
}
|
|
}
|
|
|
else:
|
|
else:
|
|
|
- # 作为节
|
|
|
|
|
|
|
+ # 作为节,自动编号
|
|
|
|
|
+ section_idx = len(current_chapter["subsections"]) + 1
|
|
|
|
|
+ section_cn = self._number_to_chinese(section_idx)
|
|
|
current_chapter["subsections"].append({
|
|
current_chapter["subsections"].append({
|
|
|
- "title": title,
|
|
|
|
|
|
|
+ "title": f"{section_cn}、{title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"level": 2,
|
|
"level": 2,
|
|
|
"original": line
|
|
"original": line
|
|
@@ -502,17 +507,33 @@ class TOCCatalogExtractor:
|
|
|
if not chapters and lines:
|
|
if not chapters and lines:
|
|
|
chapters = self._fallback_parse(lines)
|
|
chapters = self._fallback_parse(lines)
|
|
|
|
|
|
|
|
|
|
+ # 构建标准格式文本
|
|
|
|
|
+ formatted_lines = []
|
|
|
|
|
+ for ch in chapters:
|
|
|
|
|
+ formatted_lines.append(ch["title"])
|
|
|
|
|
+ for sub in ch.get("subsections", []):
|
|
|
|
|
+ formatted_lines.append(f" {sub['title']}")
|
|
|
|
|
+
|
|
|
|
|
+ formatted_text = "\n".join(formatted_lines)
|
|
|
|
|
+
|
|
|
|
|
+ # 日志输出完整的目录解析结果
|
|
|
|
|
+ logger.info(f"[TOC解析] 共 {len(chapters)} 章,标准格式文本:\n{formatted_text}")
|
|
|
|
|
+
|
|
|
return {
|
|
return {
|
|
|
"chapters": chapters,
|
|
"chapters": chapters,
|
|
|
- "total_chapters": len(chapters)
|
|
|
|
|
|
|
+ "total_chapters": len(chapters),
|
|
|
|
|
+ "raw_ocr_text": text,
|
|
|
|
|
+ "formatted_text": formatted_text
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
|
|
def _fallback_parse(self, lines: List[str]) -> List[Dict[str, Any]]:
|
|
|
"""
|
|
"""
|
|
|
降级解析策略:当正则无法匹配时使用启发式方法
|
|
降级解析策略:当正则无法匹配时使用启发式方法
|
|
|
|
|
+ 输出标准格式:第X章 XXX / 一、XXX
|
|
|
"""
|
|
"""
|
|
|
chapters = []
|
|
chapters = []
|
|
|
idx = 0
|
|
idx = 0
|
|
|
|
|
+ section_idx = 0
|
|
|
|
|
|
|
|
for line in lines:
|
|
for line in lines:
|
|
|
line = line.strip()
|
|
line = line.strip()
|
|
@@ -530,22 +551,26 @@ class TOCCatalogExtractor:
|
|
|
# 根据内容特征判断层级
|
|
# 根据内容特征判断层级
|
|
|
is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
|
|
is_chapter = any(kw in title for kw in ['编制依据', '工程概况', '施工计划',
|
|
|
'施工工艺', '安全保证', '质量保证',
|
|
'施工工艺', '安全保证', '质量保证',
|
|
|
- '环境保证', '人员配备', '验收'])
|
|
|
|
|
|
|
+ '环境保证', '人员配备', '验收',
|
|
|
|
|
+ '其他资料'])
|
|
|
|
|
|
|
|
if is_chapter or len(chapters) == 0:
|
|
if is_chapter or len(chapters) == 0:
|
|
|
idx += 1
|
|
idx += 1
|
|
|
|
|
+ section_idx = 0 # 重置节计数
|
|
|
chapters.append({
|
|
chapters.append({
|
|
|
"index": idx,
|
|
"index": idx,
|
|
|
- "title": title,
|
|
|
|
|
|
|
+ "title": f"第{idx}章 {title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"original": line,
|
|
"original": line,
|
|
|
"subsections": []
|
|
"subsections": []
|
|
|
})
|
|
})
|
|
|
else:
|
|
else:
|
|
|
- # 作为上一章的节
|
|
|
|
|
|
|
+ # 作为上一章的节,使用标准格式 一、二、三
|
|
|
if chapters:
|
|
if chapters:
|
|
|
|
|
+ section_idx += 1
|
|
|
|
|
+ section_cn = self._number_to_chinese(section_idx)
|
|
|
chapters[-1]["subsections"].append({
|
|
chapters[-1]["subsections"].append({
|
|
|
- "title": title,
|
|
|
|
|
|
|
+ "title": f"{section_cn}、{title}",
|
|
|
"page": page,
|
|
"page": page,
|
|
|
"level": 2,
|
|
"level": 2,
|
|
|
"original": line
|
|
"original": line
|
|
@@ -553,14 +578,14 @@ class TOCCatalogExtractor:
|
|
|
|
|
|
|
|
return chapters
|
|
return chapters
|
|
|
|
|
|
|
|
- def _chinese_to_number(self, chinese: str) -> int:
|
|
|
|
|
- """中文数字转阿拉伯数字"""
|
|
|
|
|
|
|
+ def _number_to_chinese(self, num: int) -> str:
|
|
|
|
|
+ """阿拉伯数字转中文数字"""
|
|
|
chinese_nums = {
|
|
chinese_nums = {
|
|
|
- '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
|
|
|
|
|
- '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
|
|
|
|
|
- '十一': 11, '十二': 12
|
|
|
|
|
|
|
+ 1: '一', 2: '二', 3: '三', 4: '四', 5: '五',
|
|
|
|
|
+ 6: '六', 7: '七', 8: '八', 9: '九', 10: '十',
|
|
|
|
|
+ 11: '十一', 12: '十二', 13: '十三', 14: '十四', 15: '十五'
|
|
|
}
|
|
}
|
|
|
- return chinese_nums.get(chinese, 0)
|
|
|
|
|
|
|
+ return chinese_nums.get(num, str(num))
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_catalog_from_pdf(
|
|
def extract_catalog_from_pdf(
|