浏览代码

fix(sgsc-目录缺失模块-xth): 修复目录缺失检查 UnboundLocalError 及匹配逻辑错误

xgo 1 周之前
父节点
当前提交
927e84d375

+ 26 - 59
core/construction_review/component/ai_review_engine.py

@@ -1069,79 +1069,46 @@ class AIReviewEngine(BaseReviewer):
             # 使用模糊匹配器
             matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
 
-            # 构建 outline 标题到分类代码的映射
+            # 🆕 构建 match_catalogue_by_title 需要的 outline_by_first 结构
             # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
-            outline_first_map = {}  # title -> chapter_classification
-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
+            outline_by_first = {}
 
             for chapter in outline_chapters:
                 if not isinstance(chapter, dict):
                     continue
                 first_code = chapter.get('chapter_classification', '')
                 first_title = chapter.get('title', '')
-                if first_code and first_title:
-                    outline_first_map[first_title] = first_code
+                
+                # 如果code为空但有title,用title的hash作为临时key
+                if not first_code and first_title:
+                    import hashlib
+                    first_code = f"__title_{hashlib.md5(first_title.encode()).hexdigest()[:8]}"
+                    logger.debug(f"[{name}] 一级code为空,使用临时key: {first_code} -> {first_title}")
+                
+                if not first_code:
+                    continue
 
+                # 收集二级标题
+                subsections = []
                 for sub in chapter.get('subsections', []):
                     if isinstance(sub, dict):
                         sub_title = sub.get('title', '')
-                        second_code = sub.get('secondary_category_code', '')
-                        if first_title and sub_title and second_code:
-                            outline_second_map[(first_title, sub_title)] = second_code
-
-            # 使用 catalog 的标题,匹配 outline 的分类代码
-            outline_first = set()
-            outline_secondary = {}
-
-            for chapter in catalog_chapters:
-                if not isinstance(chapter, dict):
-                    continue
-
-                catalog_title = chapter.get('title', '')
-                if not catalog_title:
-                    continue
-
-                # 尝试从 outline 匹配一级分类代码
-                first_code = outline_first_map.get(catalog_title)
-
-                # 如果精确匹配失败,尝试模糊匹配
-                if not first_code:
-                    for outline_title, code in outline_first_map.items():
-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
-                            first_code = code
-                            break
-
-                if first_code:
-                    outline_first.add(first_code)
-
-                # 匹配二级分类
-                for sub in chapter.get('subsections', []):
-                    if not isinstance(sub, dict):
-                        continue
-                    sub_title = sub.get('title', '')
-                    if not sub_title:
-                        continue
-
-                    # 尝试精确匹配
-                    second_code = outline_second_map.get((catalog_title, sub_title))
-
-                    # 模糊匹配
-                    if not second_code and first_code:
-                        for (outline_first, outline_sub), code in outline_second_map.items():
-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
-                                second_code = code
-                                break
-
-                    if first_code and second_code:
-                        outline_secondary[(first_code, second_code)] = sub_title
+                        if sub_title:
+                            subsections.append(sub_title)
+                    elif isinstance(sub, str):
+                        if sub:
+                            subsections.append(sub)
+
+                outline_by_first[first_code] = {
+                    'title': first_title,
+                    'subsections': subsections
+                }
 
-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
+            logger.info(f"[{name}] 提取到 {len(outline_by_first)} 个一级目录")
 
             # 执行标准目录匹配检查
-            match_result = matcher.match_catalogue(
-                outline_first=outline_first,
-                outline_secondary=outline_secondary,
+            match_result = matcher.match_catalogue_by_title(
+                outline_by_first=outline_by_first,
                 threshold=0.6
             )
             

+ 26 - 4
core/construction_review/component/outline_catalogue_matcher.py

@@ -11,8 +11,6 @@ import difflib
 import logging
 import re
 from typing import Dict, List, Optional, Set, Tuple, Any
-from collections import defaultdict
-from pathlib import Path
 
 import pandas as pd
 
@@ -129,10 +127,28 @@ class OutlineCatalogueMatcher:
             pass  # 加载失败不影响主功能
     
     def _normalize_text(self, text: str) -> str:
-        """文本标准化"""
+        """文本标准化 - 清洗序号、标点、空格"""
         if not text:
             return ""
+        
+        # 1. 去除章节序号(如"一、" "5.1 " "(1)" "第1章"等)
+        # "第X章/节"格式
+        text = re.sub(r'^第[一二三四五六七八九十百千\d]+[章节][、.\s]*', '', text)
+        # 中文序号:一、二、三...,(一)、(二)...
+        text = re.sub(r'^[((]?[一二三四五六七八九十百千]+[))]?[、.\s]*', '', text)
+        # 多级数字序号:1.1.1、5.1、1.1.2等(循环处理多级)
+        while re.match(r'^\d+[..]', text):
+            text = re.sub(r'^\d+[..]', '', text)
+        # 括号数字:(1)、[1]、(1)
+        text = re.sub(r'^[((\[]\d+[))\]][、.\s]*', '', text)
+        # 单个数字+标点/空格:1)、1]、1】
+        text = re.sub(r'^\d+[))\]】][、.\s]*', '', text)
+        # 纯数字+空格(如"5 ")
+        text = re.sub(r'^\d+\s+', '', text)
+        
+        # 2. 去除标点符号
         text = re.sub(r'[\s\n\r\t.,;:!?,。;:!?、""''()()【】\[\]《》<>]', '', text)
+        
         return text.lower().strip()
     
     def _calculate_similarity(self, text1: str, text2: str) -> float:
@@ -284,11 +300,14 @@ class OutlineCatalogueMatcher:
         
         matched_first = set()
         missing_first = []
+        # 🆕 建立标准code到实际code的映射(用于二级匹配时找到正确的subsections)
+        first_code_mapping: Dict[str, str] = {}  # {标准req_code: 实际outline_code}
         
         for req_code, req_name in self.first_names.items():
             # 优先:直接用code精确匹配,因为一级分类通常较准
             if req_code in actual_first_titles:
                 matched_first.add(req_code)
+                first_code_mapping[req_code] = req_code  # 精确匹配,映射到自己
                 logger.debug(f"[一级匹配] {req_name}: 存在")
             else:
                 # 尝试用标题模糊匹配
@@ -302,6 +321,7 @@ class OutlineCatalogueMatcher:
                     for code, title in actual_first_titles.items():
                         if title == matched_title:
                             matched_first.add(req_code)
+                            first_code_mapping[req_code] = code  # 记录映射关系
                             logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
                             break
                 else:
@@ -331,7 +351,9 @@ class OutlineCatalogueMatcher:
             first_code, second_code = req_key
             
             # 🆕 步骤1:优先在同一一级下匹配
-            same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
+            # 使用映射找到实际的code(处理一级模糊匹配后的映射关系)
+            actual_first_code = first_code_mapping.get(first_code, first_code)
+            same_group_titles = outline_by_first.get(actual_first_code, {}).get('subsections', [])
             best_score_same = 0.0
             best_match_same = None
             

+ 8 - 1
views/construction_review/file_upload.py

@@ -33,12 +33,19 @@ def _find_soffice_path() -> str:
         FileNotFoundError: 未找到 LibreOffice
     """
     import platform
+    import shutil
 
     # Linux/Docker 环境:直接使用 soffice
     if platform.system() != 'Windows':
         return 'soffice'
 
-    # Windows 环境:检测常见安装路径
+    # Windows 环境:首先从 PATH 中查找
+    soffice_in_path = shutil.which('soffice')
+    if soffice_in_path:
+        logger.info(f"从 PATH 找到 LibreOffice: {soffice_in_path}")
+        return soffice_in_path
+
+    # 备选:检测常见安装路径
     possible_paths = [
         r"C:\Program Files\LibreOffice\program\soffice.exe",
         r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",