Просмотр исходного кода

dev:修复了目录识别兼容性问题;

ChenJiSheng 1 месяц назад
Родитель
Сommit
5f9542b880

+ 2 - 1
.gitignore

@@ -76,4 +76,5 @@ config/config.ini
 路桥/
 output/
 命令
-/core/construction_review/component/doc_worker/utils/llm_client copy.py
+/core/construction_review/component/doc_worker/utils/llm_client copy.py
+.venv/

+ 8 - 7
core/construction_review/component/doc_worker/config/config.yaml

@@ -114,14 +114,15 @@ header_footer_filter:
 # 目录识别配置
 toc_detection:
   # 目录行的正则模式(按优先级从高到低)
+  # 页码部分支持带修饰符号,如 ‐ 19 ‐,通过提取其中的数字来识别页码
   patterns:
-    - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(【\d+】\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(〖\d+(?:\.\d+)*〗\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(\d+[、..]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^([一二三四五六七八九十]+[、..]\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(\d+(?:\.\d+)+\s*.+?)[.·]{2,}\s*(\d{1,4})\s*$'
-    - '^(.+?)[.·]{2,}\s*(\d{1,4})\s*$'
+    - '^(第[一二三四五六七八九十\d]+[章节条款].+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^(【\d+】\s*.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^(〖\d+(?:\.\d+)*〗\s*.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^(\d+[、..]\s*.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^([一二三四五六七八九十]+[、..]\s*.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^(\d+(?:\.\d+)+\s*.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
+    - '^(.+?)[.·]{2,}\s*(.*?\d+.*?)\s*$'
   
   # 标题长度限制
   min_length: 3

+ 3 - 3
core/construction_review/component/doc_worker/config/llm_api.yaml

@@ -16,9 +16,9 @@ doubao:
   DOUBAO_API_KEY: YOUR_DOUBAO_API_KEY_FOR_RAG_EVAL
 
 qwen:
-  QWEN_SERVER_URL: https://aqai.shudaodsj.com:22000/v1/
-  QWEN_MODEL_ID: Qwen/Qwen3-30B-A3B-Instruct-2507
-  QWEN_API_KEY: ms-9ad4a379-d592-4acd-b92c-8bac08a4a045
+  QWEN_SERVER_URL: http://192.168.91.253:8003/v1/
+  QWEN_MODEL_ID: qwen3-30b
+  QWEN_API_KEY: sk-123456
 
 keywords:
   timeout: 30

+ 3 - 2
core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py

@@ -66,8 +66,9 @@ class DocxFullTextExtractor(FullTextExtractor):
                 # 段落元素
                 para = para_map[element]
                 text = para.text
-                # 过滤目录行:标题\t页码
-                if text and not re.match(r"^.+\t+\d+\s*$", text):
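+                # Filter out TOC lines ("title<TAB>page"); the page number may be wrapped in decorations such as ‐ 19 ‐.
+                # NOTE(review): this matches any line with a tab followed later by a digit; the digit need not be last.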
+                if text and not re.match(r"^.+\t+.*?\d+.*?\s*$", text):
                     all_elements.append(text)
             elif element in table_map:
                 # 表格元素

+ 13 - 3
core/construction_review/component/doc_worker/docx_worker/toc_extractor.py

@@ -14,17 +14,19 @@ from docx import Document
 
 from ..interfaces import TOCExtractor, DocumentSource
 from ..utils.toc_level_identifier import TOCLevelIdentifier
+from ..utils.toc_pattern_matcher import TOCPatternMatcher
 
 
 class DocxTOCExtractor(TOCExtractor):
     """DOCX 目录提取器"""
 
-    # 目录行模式:标题 + 制表符 + 页码
-    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>\d+)\s*$")
+    # 目录行模式:标题 + 制表符 + 页码(页码部分支持带修饰符号,如 ‐ 19 ‐)
+    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
 
     def __init__(self) -> None:
         """初始化 DOCX 目录提取器"""
         self._level_identifier = TOCLevelIdentifier()
+        self._page_extractor = TOCPatternMatcher()
 
     def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
         """
@@ -58,7 +60,15 @@ class DocxTOCExtractor(TOCExtractor):
             match = self.TOC_PATTERN.match(text)
             if match:
                 title = match.group("title").strip()
-                page = int(match.group("page"))
+                page_raw = match.group("page").strip()
+                
+                # 从可能带有修饰符号的页码中提取纯数字
+                page_num_str = self._page_extractor.extract_page_number(page_raw)
+                try:
+                    page = int(page_num_str)
+                except ValueError:
+                    # 如果无法转换为整数,跳过该项
+                    continue
                 
                 # 先不设置层级,后续统一识别
                 toc_items.append({

+ 23 - 2
core/construction_review/component/doc_worker/utils/toc_pattern_matcher.py

@@ -19,6 +19,23 @@ class TOCPatternMatcher:
     def __init__(self) -> None:
         self._cfg = default_config_provider
 
+    @staticmethod
+    def extract_page_number(page_str: str) -> str:
+        """
+        Extract the bare page number from a page string that may carry decorations.
+        
+        Examples (returns the first run of digits, or the stripped input if there is none):
+        - '‐ 1 ‐' -> '1'
+        - '19' -> '19'
+        - ' 10 ' -> '10'
+        - '‐ 19 ‐' -> '19'
+        """
+        # Take the first run of consecutive digits found anywhere in the string.
+        match = re.search(r'\d+', page_str)
+        if match:
+            return match.group(0)
+        return page_str.strip()  # No digits found: return the stripped input unchanged.
+
     def has_numbering(self, text: str) -> bool:
         """检查文本是否包含编号格式。"""
         numbering_patterns: List[str] = self._cfg.get("numbering.formats", [])
@@ -40,7 +57,8 @@ class TOCPatternMatcher:
             if re.match(r"^第[一二三四五六七八九十\d]+[章节条款]\s*$", line):
                 if i + 1 < len(lines):
                     next_line = lines[i + 1].strip()
-                    if re.search(r"[.·]{2,}.*\d{1,4}\s*$", next_line):
+                    # 支持带修饰符号的页码匹配
+                    if re.search(r"[.·]{2,}.*?\d+.*?\s*$", next_line):
                         merged_line = line + next_line
                         merged_lines.append(merged_line)
                         i += 2
@@ -72,7 +90,10 @@ class TOCPatternMatcher:
                     continue
 
                 title = match.group(1).strip()
-                page_num = match.group(2).strip()
+                page_num_raw = match.group(2).strip()
+                
+                # 从可能带有修饰符号的页码中提取纯数字
+                page_num = self.extract_page_number(page_num_raw)
 
                 title_clean = re.sub(r"[.·]{2,}", "", title)
                 title_clean = re.sub(r"\s{2,}", " ", title_clean)

+ 1 - 1
core/construction_review/component/doc_worker/命令

@@ -1,5 +1,5 @@
 python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
-python -m file_parse.pdf_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝(四川境)高速公路项目土建项目ZCB1-3合同段项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
+python -m core.construction_review.component.doc_worker.pdf_worker.cli "E:\LLM\dev_v1\files\7a88f0d5-9d82-43bf-b2b1-c2924d67477e.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output