1 周之前 · 927e84d375
--- a/core/construction_review/component/ai_review_engine.py
+++ b/core/construction_review/component/ai_review_engine.py
@@ -1069,79 +1069,46 @@ class AIReviewEngine(BaseReviewer):
 
				             # 使用模糊匹配器
			
 
				             matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
			
 
				 
			
 
				-            # 构建 outline 标题到分类代码的映射
			
 
				+            # 🆕 构建 match_catalogue_by_title 需要的 outline_by_first 结构
			
 
				             # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
			
 
				-            outline_first_map = {}  # title -> chapter_classification
			
 
				-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
			
 
				+            outline_by_first = {}
			
 
				 
			
 
				             for chapter in outline_chapters:
			
 
				                 if not isinstance(chapter, dict):
			
 
				                     continue
			
 
				                 first_code = chapter.get('chapter_classification', '')
			
 
				                 first_title = chapter.get('title', '')
			
 
				-                if first_code and first_title:
			
 
				-                    outline_first_map[first_title] = first_code
			
 
				+                
			
 
				+                # 如果code为空但有title，用title的hash作为临时key
			
 
				+                if not first_code and first_title:
			
 
				+                    import hashlib
			
 
				+                    first_code = f"__title_{hashlib.md5(first_title.encode()).hexdigest()[:8]}"
			
 
				+                    logger.debug(f"[{name}] 一级code为空，使用临时key: {first_code} -> {first_title}")
			
 
				+                
			
 
				+                if not first_code:
			
 
				+                    continue
			
 
				 
			
 
				+                # 收集二级标题
			
 
				+                subsections = []
			
 
				                 for sub in chapter.get('subsections', []):
			
 
				                     if isinstance(sub, dict):
			
 
				                         sub_title = sub.get('title', '')
			
 
				-                        second_code = sub.get('secondary_category_code', '')
			
 
				-                        if first_title and sub_title and second_code:
			
 
				-                            outline_second_map[(first_title, sub_title)] = second_code
			
 
				-
			
 
				-            # 使用 catalog 的标题，匹配 outline 的分类代码
			
 
				-            outline_first = set()
			
 
				-            outline_secondary = {}
			
 
				-
			
 
				-            for chapter in catalog_chapters:
			
 
				-                if not isinstance(chapter, dict):
			
 
				-                    continue
			
 
				-
			
 
				-                catalog_title = chapter.get('title', '')
			
 
				-                if not catalog_title:
			
 
				-                    continue
			
 
				-
			
 
				-                # 尝试从 outline 匹配一级分类代码
			
 
				-                first_code = outline_first_map.get(catalog_title)
			
 
				-
			
 
				-                # 如果精确匹配失败，尝试模糊匹配
			
 
				-                if not first_code:
			
 
				-                    for outline_title, code in outline_first_map.items():
			
 
				-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
			
 
				-                            first_code = code
			
 
				-                            break
			
 
				-
			
 
				-                if first_code:
			
 
				-                    outline_first.add(first_code)
			
 
				-
			
 
				-                # 匹配二级分类
			
 
				-                for sub in chapter.get('subsections', []):
			
 
				-                    if not isinstance(sub, dict):
			
 
				-                        continue
			
 
				-                    sub_title = sub.get('title', '')
			
 
				-                    if not sub_title:
			
 
				-                        continue
			
 
				-
			
 
				-                    # 尝试精确匹配
			
 
				-                    second_code = outline_second_map.get((catalog_title, sub_title))
			
 
				-
			
 
				-                    # 模糊匹配
			
 
				-                    if not second_code and first_code:
			
 
				-                        for (outline_first, outline_sub), code in outline_second_map.items():
			
 
				-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
			
 
				-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
			
 
				-                                second_code = code
			
 
				-                                break
			
 
				-
			
 
				-                    if first_code and second_code:
			
 
				-                        outline_secondary[(first_code, second_code)] = sub_title
			
 
				+                        if sub_title:
			
 
				+                            subsections.append(sub_title)
			
 
				+                    elif isinstance(sub, str):
			
 
				+                        if sub:
			
 
				+                            subsections.append(sub)
			
 
				+
			
 
				+                outline_by_first[first_code] = {
			
 
				+                    'title': first_title,
			
 
				+                    'subsections': subsections
			
 
				+                }
			
 
				 
			
 
				-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
			
 
				+            logger.info(f"[{name}] 提取到 {len(outline_by_first)} 个一级目录")
			
 
				 
			
 
				             # 执行标准目录匹配检查
			
 
				-            match_result = matcher.match_catalogue(
			
 
				-                outline_first=outline_first,
			
 
				-                outline_secondary=outline_secondary,
			
 
				+            match_result = matcher.match_catalogue_by_title(
			
 
				+                outline_by_first=outline_by_first,
			
 
				                 threshold=0.6
			
 
				             )
			
 
				             
			
--- a/core/construction_review/component/outline_catalogue_matcher.py
+++ b/core/construction_review/component/outline_catalogue_matcher.py
@@ -11,8 +11,6 @@ import difflib
 
				 import logging
			
 
				 import re
			
 
				 from typing import Dict, List, Optional, Set, Tuple, Any
			
 
				-from collections import defaultdict
			
 
				-from pathlib import Path
			
 
				 
			
 
				 import pandas as pd
			
 
				 
			
@@ -129,10 +127,28 @@ class OutlineCatalogueMatcher:
 
				             pass  # 加载失败不影响主功能
			
 
				     
			
 
				     def _normalize_text(self, text: str) -> str:
			
 
				-        """文本标准化"""
			
 
				+        """文本标准化 - 清洗序号、标点、空格"""
			
 
				         if not text:
			
 
				             return ""
			
 
				+        
			
 
				+        # 1. 去除章节序号（如"一、" "5.1 " "(1)" "第1章"等）
			
 
				+        # "第X章/节"格式
			
 
				+        text = re.sub(r'^第[一二三四五六七八九十百千\d]+[章节][、.\s]*', '', text)
			
 
				+        # 中文序号：一、二、三...，（一）、（二）...
			
 
				+        text = re.sub(r'^[(（]?[一二三四五六七八九十百千]+[)）]?[、.\s]*', '', text)
			
 
				+        # 多级数字序号：1.1.1、5.1、1.1.2等（循环处理多级）
			
 
				+        while re.match(r'^\d+[.．]', text):
			
 
				+            text = re.sub(r'^\d+[.．]', '', text)
			
 
				+        # 括号数字：(1)、[1]、（1）
			
 
				+        text = re.sub(r'^[(（\[]\d+[)）\]][、.\s]*', '', text)
			
 
				+        # 单个数字+标点/空格：1)、1]、1】
			
 
				+        text = re.sub(r'^\d+[)）\]】][、.\s]*', '', text)
			
 
				+        # 纯数字+空格（如"5 "）
			
 
				+        text = re.sub(r'^\d+\s+', '', text)
			
 
				+        
			
 
				+        # 2. 去除标点符号
			
 
				         text = re.sub(r'[\s\n\r\t.,;:!?，。；：！？、""''（）()【】\[\]《》<>]', '', text)
			
 
				+        
			
 
				         return text.lower().strip()
			
 
				     
			
 
				     def _calculate_similarity(self, text1: str, text2: str) -> float:
			
@@ -284,11 +300,14 @@ class OutlineCatalogueMatcher:
 
				         
			
 
				         matched_first = set()
			
 
				         missing_first = []
			
 
				+        # 🆕 建立标准code到实际code的映射（用于二级匹配时找到正确的subsections）
			
 
				+        first_code_mapping: Dict[str, str] = {}  # {标准req_code: 实际outline_code}
			
 
				         
			
 
				         for req_code, req_name in self.first_names.items():
			
 
				             # 优先：直接用code精确匹配，因为一级分类通常较准
			
 
				             if req_code in actual_first_titles:
			
 
				                 matched_first.add(req_code)
			
 
				+                first_code_mapping[req_code] = req_code  # 精确匹配，映射到自己
			
 
				                 logger.debug(f"[一级匹配] {req_name}: 存在")
			
 
				             else:
			
 
				                 # 尝试用标题模糊匹配
			
@@ -302,6 +321,7 @@ class OutlineCatalogueMatcher:
 
				                     for code, title in actual_first_titles.items():
			
 
				                         if title == matched_title:
			
 
				                             matched_first.add(req_code)
			
 
				+                            first_code_mapping[req_code] = code  # 记录映射关系
			
 
				                             logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
			
 
				                             break
			
 
				                 else:
			
@@ -331,7 +351,9 @@ class OutlineCatalogueMatcher:
 
				             first_code, second_code = req_key
			
 
				             
			
 
				             # 🆕 步骤1：优先在同一一级下匹配
			
 
				-            same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
			
 
				+            # 使用映射找到实际的code（处理一级模糊匹配后的映射关系）
			
 
				+            actual_first_code = first_code_mapping.get(first_code, first_code)
			
 
				+            same_group_titles = outline_by_first.get(actual_first_code, {}).get('subsections', [])
			
 
				             best_score_same = 0.0
			
 
				             best_match_same = None
			
 
				             
			
--- a/views/construction_review/file_upload.py
+++ b/views/construction_review/file_upload.py
@@ -33,12 +33,19 @@ def _find_soffice_path() -> str:
 
				         FileNotFoundError: 未找到 LibreOffice
			
 
				     """
			
 
				     import platform
			
 
				+    import shutil
			
 
				 
			
 
				     # Linux/Docker 环境：直接使用 soffice
			
 
				     if platform.system() != 'Windows':
			
 
				         return 'soffice'
			
 
				 
			
 
				-    # Windows 环境：检测常见安装路径
			
 
				+    # Windows 环境：首先从 PATH 中查找
			
 
				+    soffice_in_path = shutil.which('soffice')
			
 
				+    if soffice_in_path:
			
 
				+        logger.info(f"从 PATH 找到 LibreOffice: {soffice_in_path}")
			
 
				+        return soffice_in_path
			
 
				+
			
 
				+    # 备选：检测常见安装路径
			
 
				     possible_paths = [
			
 
				         r"C:\Program Files\LibreOffice\program\soffice.exe",
			
 
				         r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",