Kaynağa Gözat

dev:优化文件处理模块的外部引用问题;

ChenJiSheng 2 ay önce
ebeveyn
işleme
cfa7203c55

+ 2 - 2
core/base/__init__.py

@@ -31,7 +31,7 @@ from core.construction_review.component.doc_worker import DocumentClassifier
 from core.construction_review.component.doc_worker.toc.toc_extractor import TOCExtractor
 from core.construction_review.component.doc_worker.chunking.text_splitter import TextSplitter
 from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
-from core.construction_review.component.doc_worker.classification.llm_classifier import LLMClassifier
+from core.construction_review.component.doc_worker.classification.rule_based_classifier import RuleBasedClassifier
 from core.construction_review.component.doc_worker.output.result_saver import ResultSaver
 
 
@@ -40,7 +40,7 @@ __all__ = [
     'TOCExtractor',
     'TextSplitter',
     'HierarchyClassifier',
-    'LLMClassifier',
+    'RuleBasedClassifier',
     'ResultSaver'
 ]
 

+ 23 - 16
core/construction_review/component/doc_worker/__init__.py

@@ -26,27 +26,34 @@
 __version__ = "2.0.0"
 __author__ = "Your Name"
 
-try:
-    from .core import DocumentClassifier
-    from .toc.toc_extractor import TOCExtractor
-    from .chunking.text_splitter import TextSplitter
-    from .classification.hierarchy_classifier import HierarchyClassifier
-    from .classification.llm_classifier import LLMClassifier
-    from .output.result_saver import ResultSaver
-except ImportError:
-    from core import DocumentClassifier
-    from toc.toc_extractor import TOCExtractor
-    from chunking.text_splitter import TextSplitter
-    from classification.hierarchy_classifier import HierarchyClassifier
-    from classification.llm_classifier import LLMClassifier
-    from output.result_saver import ResultSaver
+from .core import DocumentClassifier
+from .toc.toc_extractor import TOCExtractor
+from .chunking.text_splitter import TextSplitter
+from .classification.hierarchy_classifier import HierarchyClassifier
+from .classification.rule_based_classifier import RuleBasedClassifier
+from .output.result_saver import ResultSaver
+
+
+class LLMClassifier:
+    """
+    占位LLM分类器,避免未实现类的导入错误。
+    当前仅提供接口占位,后续可替换为真实的LLM服务实现。
+    """
+
+    def __init__(self, model_url: str):
+        self.model_url = model_url
+
+    def classify(self, toc_items, target_level=None):
+        # 返回None以触发上层的回退逻辑
+        return None
 
 __all__ = [
     'DocumentClassifier',
     'TOCExtractor',
     'TextSplitter',
     'HierarchyClassifier',
-    'LLMClassifier',
-    'ResultSaver'
+    'RuleBasedClassifier',
+    'ResultSaver',
+    'LLMClassifier'
 ]
 

+ 2 - 2
core/construction_review/component/doc_worker/classification/__init__.py

@@ -2,9 +2,9 @@
 分类模块
 """
 
-from .llm_classifier import LLMClassifier
+from .rule_based_classifier import RuleBasedClassifier
 from .hierarchy_classifier import HierarchyClassifier
 
-__all__ = ['LLMClassifier', 'HierarchyClassifier']
+__all__ = ['RuleBasedClassifier', 'HierarchyClassifier']
 
 

+ 5 - 10
core/construction_review/component/doc_worker/classification/llm_classifier.py → core/construction_review/component/doc_worker/classification/rule_based_classifier.py

@@ -11,17 +11,12 @@ except ImportError:
     from config.config_loader import get_config
 
 
-class LLMClassifier:
-    """目录分类器(基于正则表达式和关键词匹配)"""
+class RuleBasedClassifier:
+    """基于规则的目录分类器(使用正则表达式和关键词匹配)"""
     
-    def __init__(self, model_url=None, model_name=None, api_key=None):
+    def __init__(self):
         """
         初始化分类器
-        
-        参数:
-            model_url: 模型API地址(已废弃,保留以兼容旧接口)
-            model_name: 模型名称(已废弃,保留以兼容旧接口)
-            api_key: API密钥(已废弃,保留以兼容旧接口)
         """
         self.config = get_config()
         self.category_mapping = self.config.category_mapping
@@ -118,8 +113,8 @@ class LLMClassifier:
                 if keyword in title or keyword in title_clean:
                     return category
         
-        # 默认返回"其资料"
-        return "其资料"
+        # 默认返回"其资料"
+        return "其资料"
     
     def _remove_number_prefix(self, title):
         """

+ 1 - 1
core/construction_review/component/doc_worker/config/config_loader.py

@@ -48,7 +48,7 @@ class Config:
         获取配置值
         
         参数:
-            key_path: 配置键路径,用点号分隔,如 'llm.model_url'
+            key_path: 配置键路径,用点号分隔,如 'categories.mapping'
             default: 默认值
             
         返回:

+ 5 - 12
core/construction_review/component/doc_worker/core.py

@@ -7,18 +7,11 @@ from pathlib import Path
 from collections import Counter
 import time
 
-try:
-    from .toc.toc_extractor import TOCExtractor
-    from .classification.hierarchy_classifier import HierarchyClassifier
-    from .chunking.text_splitter import TextSplitter
-    from .output.result_saver import ResultSaver
-    from .config.config_loader import get_config
-except ImportError:
-    from toc.toc_extractor import TOCExtractor
-    from classification.hierarchy_classifier import HierarchyClassifier
-    from chunking.text_splitter import TextSplitter
-    from output.result_saver import ResultSaver
-    from config.config_loader import get_config
+from .toc.toc_extractor import TOCExtractor
+from .classification.hierarchy_classifier import HierarchyClassifier
+from .chunking.text_splitter import TextSplitter
+from .output.result_saver import ResultSaver
+from .config.config_loader import get_config
 
 
 class DocumentClassifier: