Răsfoiți Sursa

解决冲突

WangXuMing 2 luni în urmă
părinte
comite
ee18288171
24 a modificat fișierele cu 155 adăugiri și 93 ștergeri
  1. 8 8
      core/construction_review/component/doc_worker/__init__.py
  2. 9 0
      core/construction_review/component/doc_worker/chunking/__init__.py
  3. 0 0
      core/construction_review/component/doc_worker/chunking/chunk_merger.py
  4. 55 7
      core/construction_review/component/doc_worker/chunking/chunk_metadata.py
  5. 2 2
      core/construction_review/component/doc_worker/chunking/chunk_splitter.py
  6. 0 0
      core/construction_review/component/doc_worker/chunking/hierarchy_processor.py
  7. 2 2
      core/construction_review/component/doc_worker/chunking/text_splitter.py
  8. 2 2
      core/construction_review/component/doc_worker/chunking/text_utils.py
  9. 2 2
      core/construction_review/component/doc_worker/chunking/title_matcher.py
  10. 9 0
      core/construction_review/component/doc_worker/classification/__init__.py
  11. 2 2
      core/construction_review/component/doc_worker/classification/llm_classifier.py
  12. 9 0
      core/construction_review/component/doc_worker/config/__init__.py
  13. 0 0
      core/construction_review/component/doc_worker/config/config.yaml
  14. 1 0
      core/construction_review/component/doc_worker/config/config_loader.py
  15. 10 10
      core/construction_review/component/doc_worker/core.py
  16. 9 0
      core/construction_review/component/doc_worker/output/__init__.py
  17. 2 2
      core/construction_review/component/doc_worker/output/result_saver.py
  18. 9 0
      core/construction_review/component/doc_worker/toc/__init__.py
  19. 2 2
      core/construction_review/component/doc_worker/toc/document_extractor_toc.py
  20. 2 2
      core/construction_review/component/doc_worker/toc/toc_extractor.py
  21. 2 2
      core/construction_review/component/doc_worker/toc/toc_level_identifier.py
  22. 2 2
      core/construction_review/component/doc_worker/toc/toc_pattern_matcher.py
  23. 4 4
      core/construction_review/component/document_processor.py
  24. 12 44
      temp/AI审查结果.json

+ 8 - 8
core/base/doc_worker/__init__.py → core/construction_review/component/doc_worker/__init__.py

@@ -29,16 +29,16 @@ __author__ = "Your Name"
 
 try:
     from .core import DocumentClassifier
-    from .toc_extractor import TOCExtractor
-    from .text_splitter import TextSplitter
-    from .llm_classifier import LLMClassifier
-    from .result_saver import ResultSaver
+    from .toc.toc_extractor import TOCExtractor
+    from .chunking.text_splitter import TextSplitter
+    from .classification.llm_classifier import LLMClassifier
+    from .output.result_saver import ResultSaver
 except ImportError:
     from core import DocumentClassifier
-    from toc_extractor import TOCExtractor
-    from text_splitter import TextSplitter
-    from llm_classifier import LLMClassifier
-    from result_saver import ResultSaver
+    from toc.toc_extractor import TOCExtractor
+    from chunking.text_splitter import TextSplitter
+    from classification.llm_classifier import LLMClassifier
+    from output.result_saver import ResultSaver
 
 __all__ = [
     'DocumentClassifier',

+ 9 - 0
core/construction_review/component/doc_worker/chunking/__init__.py

@@ -0,0 +1,9 @@
+"""
+Text chunking package; re-exports TextSplitter from the text_splitter module.
+"""
+
+from .text_splitter import TextSplitter
+
+__all__ = ['TextSplitter']
+
+

+ 0 - 0
core/base/doc_worker/chunk_merger.py → core/construction_review/component/doc_worker/chunking/chunk_merger.py


+ 55 - 7
core/base/doc_worker/chunk_metadata.py → core/construction_review/component/doc_worker/chunking/chunk_metadata.py

@@ -96,7 +96,7 @@ class ChunkMetadata:
     
     def finalize_chunk_ids(self, chunks):
         """
-        生成最终的chunk_id和serial_number(复用测试目录的逻辑)
+        生成最终的chunk_id和serial_number
         
         参数:
             chunks: 合并后的块列表
@@ -111,6 +111,7 @@ class ChunkMetadata:
         for i, chunk in enumerate(chunks):
             title_number = chunk.get('_title_number', '')
             is_merged = chunk.get('_is_merged', False)
+            section_label = chunk.get('section_label', '')
             
             # 提取标题编号的主要部分(用于判断是否在同一标题内)
             # 如果包含+号,说明是跨标题合并的块
@@ -119,8 +120,6 @@ class ChunkMetadata:
                 local_index = 0
                 # chunk_id中使用+号(无空格),如"1.5+1.6"
                 merged_title_number = title_number
-                # serial_number中使用空格,如"1.5 + 1.6"
-                serial_number_display = chunk.get('_title_number_display', title_number.replace('+', ' + '))
                 # 更新current_title_number为合并后的编号,这样下一个块会重新开始
                 current_title_number = title_number
             else:
@@ -133,14 +132,22 @@ class ChunkMetadata:
                 else:
                     local_index += 1
                 merged_title_number = title_number
-                serial_number_display = title_number
             
-            # 生成chunk_id(使用无空格的编号)
-            if merged_title_number:
+            # 从section_label中提取标题路径的编号路径(用于chunk_id)
+            title_number_path = self._extract_title_number_path(section_label)
+            
+            # 生成chunk_id:doc_chunk_<标题路径的编号路径>_序号
+            if title_number_path:
+                chunk_id_str = f"doc_chunk_{title_number_path}_{local_index}"
+            elif merged_title_number:
+                # 如果没有完整的编号路径,使用合并后的编号(向后兼容)
                 chunk_id_str = f"doc_chunk_{merged_title_number}_{local_index}"
             else:
                 chunk_id_str = f"doc_chunk_{local_index}"
             
+            # 从section_label中提取最底层级的编号(用于serial_number)
+            serial_number = self.text_utils.extract_number_from_section_label(section_label)
+            
             # 更新chunk数据
             final_chunk = {
                 'file_name': chunk['file_name'],
@@ -150,7 +157,7 @@ class ChunkMetadata:
                 'element_tag': {
                     'chunk_id': chunk_id_str,
                     'page': chunk['element_tag']['page'],
-                    'serial_number': serial_number_display if merged_title_number else ''
+                    'serial_number': serial_number
                 },
                 'review_chunk_content': chunk['review_chunk_content']
             }
@@ -248,6 +255,47 @@ class ChunkMetadata:
         
         return ""
     
+    def _extract_title_number_path(self, section_label):
+        """
+        Build the "number path" of a section label: one title number per
+        hierarchy level, joined with "->".
+        
+        The label is split on "->" and every non-empty level is passed to
+        self.text_utils.extract_title_number; levels for which that helper
+        returns a falsy value are left out. The exact number format is
+        decided by that helper, which is defined outside this method.
+        
+        Examples from the original author (they depend on the helper's
+        behavior -- TODO confirm against extract_title_number):
+        "第一章 工程概况->【1】工程概况->1.1 项目总体概况" -> "1->【1】->1.1"
+        "第三章 施工计划->【2】机械设备计划" -> "3->【2】"
+        "第一章 工程概况->【2】自然条件->2.1 气象情况" -> "1->【2】->2.1"
+        
+        Parameters:
+            section_label: hierarchy label string of the form
+                "level1->level2->level3". If it contains " + " (a merged
+                label), only the text before the first " + " is used.
+            
+        Returns:
+            str: the extracted numbers joined with "->", or an empty string
+                when the label is empty or no level yields a number.
+        """
+        if not section_label:
+            return ""
+        
+        # Merged labels are joined with " + "; only the first part is used
+        if ' + ' in section_label:
+            section_label = section_label.split(' + ')[0]
+        
+        # Split the hierarchy path into its levels on "->"
+        parts = section_label.split('->')
+        
+        # Collect the title number of each level
+        number_paths = []
+        for part in parts:
+            part = part.strip()
+            if part:
+                # Delegate number extraction to text_utils.extract_title_number
+                number = self.text_utils.extract_title_number(part)
+                if number:
+                    number_paths.append(number)
+        
+        # Join the collected numbers with "->"
+        if number_paths:
+            return '->'.join(number_paths)
+        
+        return ""
+    
     def build_hierarchy_path(self, title, all_toc_items, target_level):
         """
         构建从1级到当前标题的完整层级路径

+ 2 - 2
core/base/doc_worker/chunk_splitter.py → core/construction_review/component/doc_worker/chunking/chunk_splitter.py

@@ -6,10 +6,10 @@
 import re
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
     from .title_matcher import TitleMatcher
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
     from title_matcher import TitleMatcher
 
 

+ 0 - 0
core/base/doc_worker/hierarchy_processor.py → core/construction_review/component/doc_worker/chunking/hierarchy_processor.py


+ 2 - 2
core/base/doc_worker/text_splitter.py → core/construction_review/component/doc_worker/chunking/text_splitter.py

@@ -10,7 +10,7 @@ import fitz  # PyMuPDF
 from docx import Document
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
     from .title_matcher import TitleMatcher
     from .text_utils import TextUtils
     from .chunk_splitter import ChunkSplitter
@@ -18,7 +18,7 @@ try:
     from .chunk_metadata import ChunkMetadata
     from .hierarchy_processor import HierarchyProcessor
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
     from title_matcher import TitleMatcher
     from text_utils import TextUtils
     from chunk_splitter import ChunkSplitter

+ 2 - 2
core/base/doc_worker/text_utils.py → core/construction_review/component/doc_worker/chunking/text_utils.py

@@ -6,9 +6,9 @@
 import re
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class TextUtils:

+ 2 - 2
core/base/doc_worker/title_matcher.py → core/construction_review/component/doc_worker/chunking/title_matcher.py

@@ -7,9 +7,9 @@ import re
 from difflib import SequenceMatcher
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class TitleMatcher:

+ 9 - 0
core/construction_review/component/doc_worker/classification/__init__.py

@@ -0,0 +1,9 @@
+"""
+Classification package; re-exports LLMClassifier from the llm_classifier module.
+"""
+
+from .llm_classifier import LLMClassifier
+
+__all__ = ['LLMClassifier']
+
+

+ 2 - 2
core/base/doc_worker/llm_classifier.py → core/construction_review/component/doc_worker/classification/llm_classifier.py

@@ -6,9 +6,9 @@
 import re
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class LLMClassifier:

+ 9 - 0
core/construction_review/component/doc_worker/config/__init__.py

@@ -0,0 +1,9 @@
+"""
+Configuration package; re-exports get_config and Config from the config_loader module.
+"""
+
+from .config_loader import get_config, Config
+
+__all__ = ['get_config', 'Config']
+
+

+ 0 - 0
core/base/doc_worker/config.yaml → core/construction_review/component/doc_worker/config/config.yaml


+ 1 - 0
core/base/doc_worker/config_loader.py → core/construction_review/component/doc_worker/config/config_loader.py

@@ -32,6 +32,7 @@ class Config:
             config_path: 配置文件路径,默认为当前目录下的config.yaml
         """
         if config_path is None:
+            # config.yaml 现在在同一目录下
             config_path = Path(__file__).parent / 'config.yaml'
         else:
             config_path = Path(config_path)

+ 10 - 10
core/base/doc_worker/core.py → core/construction_review/component/doc_worker/core.py

@@ -7,17 +7,17 @@ from pathlib import Path
 from collections import Counter
 
 try:
-    from .toc_extractor import TOCExtractor
-    from .llm_classifier import LLMClassifier
-    from .text_splitter import TextSplitter
-    from .result_saver import ResultSaver
-    from .config_loader import get_config
+    from .toc.toc_extractor import TOCExtractor
+    from .classification.llm_classifier import LLMClassifier
+    from .chunking.text_splitter import TextSplitter
+    from .output.result_saver import ResultSaver
+    from .config.config_loader import get_config
 except ImportError:
-    from toc_extractor import TOCExtractor
-    from llm_classifier import LLMClassifier
-    from text_splitter import TextSplitter
-    from result_saver import ResultSaver
-    from config_loader import get_config
+    from toc.toc_extractor import TOCExtractor
+    from classification.llm_classifier import LLMClassifier
+    from chunking.text_splitter import TextSplitter
+    from output.result_saver import ResultSaver
+    from config.config_loader import get_config
 
 
 class DocumentClassifier:

+ 9 - 0
core/construction_review/component/doc_worker/output/__init__.py

@@ -0,0 +1,9 @@
+"""
+Output package; re-exports ResultSaver from the result_saver module.
+"""
+
+from .result_saver import ResultSaver
+
+__all__ = ['ResultSaver']
+
+

+ 2 - 2
core/base/doc_worker/result_saver.py → core/construction_review/component/doc_worker/output/result_saver.py

@@ -9,9 +9,9 @@ from datetime import datetime
 from collections import defaultdict, Counter
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class ResultSaver:

+ 9 - 0
core/construction_review/component/doc_worker/toc/__init__.py

@@ -0,0 +1,9 @@
+"""
+Table-of-contents extraction package; re-exports TOCExtractor from the toc_extractor module.
+"""
+
+from .toc_extractor import TOCExtractor
+
+__all__ = ['TOCExtractor']
+
+

+ 2 - 2
core/base/doc_worker/document_extractor_toc.py → core/construction_review/component/doc_worker/toc/document_extractor_toc.py

@@ -9,10 +9,10 @@ import fitz  # PyMuPDF
 from docx import Document
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
     from .toc_pattern_matcher import TOCPatternMatcher
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
     from toc_pattern_matcher import TOCPatternMatcher
 
 

+ 2 - 2
core/base/doc_worker/toc_extractor.py → core/construction_review/component/doc_worker/toc/toc_extractor.py

@@ -7,12 +7,12 @@ from pathlib import Path
 from typing import Union
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
     from .document_extractor_toc import DocumentExtractorTOC
     from .toc_pattern_matcher import TOCPatternMatcher
     from .toc_level_identifier import TOCLevelIdentifier
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
     from document_extractor_toc import DocumentExtractorTOC
     from toc_pattern_matcher import TOCPatternMatcher
     from toc_level_identifier import TOCLevelIdentifier

+ 2 - 2
core/base/doc_worker/toc_level_identifier.py → core/construction_review/component/doc_worker/toc/toc_level_identifier.py

@@ -6,9 +6,9 @@
 import re
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class TOCLevelIdentifier:

+ 2 - 2
core/base/doc_worker/toc_pattern_matcher.py → core/construction_review/component/doc_worker/toc/toc_pattern_matcher.py

@@ -6,9 +6,9 @@
 import re
 
 try:
-    from .config_loader import get_config
+    from ..config.config_loader import get_config
 except ImportError:
-    from config_loader import get_config
+    from config.config_loader import get_config
 
 
 class TOCPatternMatcher:

+ 4 - 4
core/construction_review/component/document_processor.py

@@ -15,11 +15,11 @@ from foundation.logger.loggering import server_logger as logger
 
 # 引入doc_worker核心组件
 try:
-    from base.doc_worker import TOCExtractor, TextSplitter, LLMClassifier
-    from base.doc_worker.config_loader import get_config
+    from .doc_worker import TOCExtractor, TextSplitter, LLMClassifier
+    from .doc_worker.config.config_loader import get_config
 except ImportError:
-    from core.base.doc_worker import TOCExtractor, TextSplitter, LLMClassifier
-    from core.base.doc_worker.config_loader import get_config
+    from core.construction_review.component.doc_worker import TOCExtractor, TextSplitter, LLMClassifier
+    from core.construction_review.component.doc_worker.config.config_loader import get_config
 
 class DocumentProcessor:
     """文档处理器"""

Fișier diff suprimat deoarece este prea mare
+ 12 - 44
temp/AI审查结果.json


Unele fișiere nu au fost afișate deoarece prea multe fișiere au fost modificate în acest diff