1 lună în urmă · 927e84d375
--- a/core/construction_review/component/ai_review_engine.py
+++ b/core/construction_review/component/ai_review_engine.py
@@ -1069,79 +1069,46 @@ class AIReviewEngine(BaseReviewer):
 
															             # 使用模糊匹配器
														
 
															             matcher = OutlineCatalogueMatcher(csv_path, raw_content_csv)
														
 
															-            # 构建 outline 标题到分类代码的映射
														
 
															+            # 🆕 构建 match_catalogue_by_title 需要的 outline_by_first 结构
														
 
															             # outline: [{"chapter_classification": "basis", "title": "第一章 编制依据", "subsections": [...]}]
														
 
															-            outline_first_map = {}  # title -> chapter_classification
														
 
															-            outline_second_map = {}  # (first_title, sub_title) -> secondary_category_code
														
 
															+            outline_by_first = {}
														
 
															             for chapter in outline_chapters:
														
 
															                 if not isinstance(chapter, dict):
														
 
															                     continue
														
 
															                 first_code = chapter.get('chapter_classification', '')
														
 
															                 first_title = chapter.get('title', '')
														
 
															-                if first_code and first_title:
														
 
															-                    outline_first_map[first_title] = first_code
														
 
															+                
														
 
															+                # 如果code为空但有title，用title的hash作为临时key
														
 
															+                if not first_code and first_title:
														
 
															+                    import hashlib
														
 
															+                    first_code = f"__title_{hashlib.md5(first_title.encode()).hexdigest()[:8]}"
														
 
															+                    logger.debug(f"[{name}] 一级code为空，使用临时key: {first_code} -> {first_title}")
														
 
															+                
														
 
															+                if not first_code:
														
 
															+                    continue
														
 
															+                # 收集二级标题
														
 
															+                subsections = []
														
 
															                 for sub in chapter.get('subsections', []):
														
 
															                     if isinstance(sub, dict):
														
 
															                         sub_title = sub.get('title', '')
														
 
															-                        second_code = sub.get('secondary_category_code', '')
														
 
															-                        if first_title and sub_title and second_code:
														
 
															-                            outline_second_map[(first_title, sub_title)] = second_code
														
 
															-
														
 
															-            # 使用 catalog 的标题，匹配 outline 的分类代码
														
 
															-            outline_first = set()
														
 
															-            outline_secondary = {}
														
 
															-
														
 
															-            for chapter in catalog_chapters:
														
 
															-                if not isinstance(chapter, dict):
														
 
															-                    continue
														
 
															-
														
 
															-                catalog_title = chapter.get('title', '')
														
 
															-                if not catalog_title:
														
 
															-                    continue
														
 
															-
														
 
															-                # 尝试从 outline 匹配一级分类代码
														
 
															-                first_code = outline_first_map.get(catalog_title)
														
 
															-
														
 
															-                # 如果精确匹配失败，尝试模糊匹配
														
 
															-                if not first_code:
														
 
															-                    for outline_title, code in outline_first_map.items():
														
 
															-                        if matcher._calculate_similarity(catalog_title, outline_title) > 0.7:
														
 
															-                            first_code = code
														
 
															-                            break
														
 
															-
														
 
															-                if first_code:
														
 
															-                    outline_first.add(first_code)
														
 
															-
														
 
															-                # 匹配二级分类
														
 
															-                for sub in chapter.get('subsections', []):
														
 
															-                    if not isinstance(sub, dict):
														
 
															-                        continue
														
 
															-                    sub_title = sub.get('title', '')
														
 
															-                    if not sub_title:
														
 
															-                        continue
														
 
															-
														
 
															-                    # 尝试精确匹配
														
 
															-                    second_code = outline_second_map.get((catalog_title, sub_title))
														
 
															-
														
 
															-                    # 模糊匹配
														
 
															-                    if not second_code and first_code:
														
 
															-                        for (outline_first, outline_sub), code in outline_second_map.items():
														
 
															-                            if matcher._calculate_similarity(catalog_title, outline_first) > 0.7 and \
														
 
															-                               matcher._calculate_similarity(sub_title, outline_sub) > 0.7:
														
 
															-                                second_code = code
														
 
															-                                break
														
 
															-
														
 
															-                    if first_code and second_code:
														
 
															-                        outline_secondary[(first_code, second_code)] = sub_title
														
 
															+                        if sub_title:
														
 
															+                            subsections.append(sub_title)
														
 
															+                    elif isinstance(sub, str):
														
 
															+                        if sub:
														
 
															+                            subsections.append(sub)
														
 
															+
														
 
															+                outline_by_first[first_code] = {
														
 
															+                    'title': first_title,
														
 
															+                    'subsections': subsections
														
 
															+                }
														
 
															-            logger.info(f"[{name}] 匹配到 {len(outline_first)} 个一级, {len(outline_secondary)} 个二级")
														
 
															+            logger.info(f"[{name}] 提取到 {len(outline_by_first)} 个一级目录")
														
 
															             # 执行标准目录匹配检查
														
 
															-            match_result = matcher.match_catalogue(
														
 
															-                outline_first=outline_first,
														
 
															-                outline_secondary=outline_secondary,
														
 
															+            match_result = matcher.match_catalogue_by_title(
														
 
															+                outline_by_first=outline_by_first,
														
 
															                 threshold=0.6
														
 
															             )
														
--- a/core/construction_review/component/outline_catalogue_matcher.py
+++ b/core/construction_review/component/outline_catalogue_matcher.py
@@ -11,8 +11,6 @@ import difflib
 
															 import logging
														
 
															 import re
														
 
															 from typing import Dict, List, Optional, Set, Tuple, Any
														
 
															-from collections import defaultdict
														
 
															-from pathlib import Path
														
 
															 import pandas as pd
														
@@ -129,10 +127,28 @@ class OutlineCatalogueMatcher:
 
															             pass  # 加载失败不影响主功能
														
 
															     def _normalize_text(self, text: str) -> str:
														
 
															-        """文本标准化"""
														
 
															+        """文本标准化 - 清洗序号、标点、空格"""
														
 
															         if not text:
														
 
															             return ""
														
 
															+        
														
 
															+        # 1. 去除章节序号（如"一、" "5.1 " "(1)" "第1章"等）
														
 
															+        # "第X章/节"格式
														
 
															+        text = re.sub(r'^第[一二三四五六七八九十百千\d]+[章节][、.\s]*', '', text)
														
 
															+        # 中文序号：一、二、三...，（一）、（二）...
														
 
															+        text = re.sub(r'^[(（]?[一二三四五六七八九十百千]+[)）]?[、.\s]*', '', text)
														
 
															+        # 多级数字序号：1.1.1、5.1、1.1.2等（循环处理多级）
														
 
															+        while re.match(r'^\d+[.．]', text):
														
 
															+            text = re.sub(r'^\d+[.．]', '', text)
														
 
															+        # 括号数字：(1)、[1]、（1）
														
 
															+        text = re.sub(r'^[(（\[]\d+[)）\]][、.\s]*', '', text)
														
 
															+        # 单个数字+标点/空格：1)、1]、1】
														
 
															+        text = re.sub(r'^\d+[)）\]】][、.\s]*', '', text)
														
 
															+        # 纯数字+空格（如"5 "）
														
 
															+        text = re.sub(r'^\d+\s+', '', text)
														
 
															+        
														
 
															+        # 2. 去除标点符号
														
 
															         text = re.sub(r'[\s\n\r\t.,;:!?，。；：！？、""''（）()【】\[\]《》<>]', '', text)
														
 
															+        
														
 
															         return text.lower().strip()
														
 
															     def _calculate_similarity(self, text1: str, text2: str) -> float:
														
@@ -284,11 +300,14 @@ class OutlineCatalogueMatcher:
 
															         matched_first = set()
														
 
															         missing_first = []
														
 
															+        # 🆕 建立标准code到实际code的映射（用于二级匹配时找到正确的subsections）
														
 
															+        first_code_mapping: Dict[str, str] = {}  # {标准req_code: 实际outline_code}
														
 
															         for req_code, req_name in self.first_names.items():
														
 
															             # 优先：直接用code精确匹配，因为一级分类通常较准
														
 
															             if req_code in actual_first_titles:
														
 
															                 matched_first.add(req_code)
														
 
															+                first_code_mapping[req_code] = req_code  # 精确匹配，映射到自己
														
 
															                 logger.debug(f"[一级匹配] {req_name}: 存在")
														
 
															             else:
														
 
															                 # 尝试用标题模糊匹配
														
@@ -302,6 +321,7 @@ class OutlineCatalogueMatcher:
 
															                     for code, title in actual_first_titles.items():
														
 
															                         if title == matched_title:
														
 
															                             matched_first.add(req_code)
														
 
															+                            first_code_mapping[req_code] = code  # 记录映射关系
														
 
															                             logger.debug(f"[一级模糊匹配] {req_name} -> {matched_title} ({score:.3f})")
														
 
															                             break
														
 
															                 else:
														
@@ -331,7 +351,9 @@ class OutlineCatalogueMatcher:
 
															             first_code, second_code = req_key
														
 
															             # 🆕 步骤1：优先在同一一级下匹配
														
 
															-            same_group_titles = outline_by_first.get(first_code, {}).get('subsections', [])
														
 
															+            # 使用映射找到实际的code（处理一级模糊匹配后的映射关系）
														
 
															+            actual_first_code = first_code_mapping.get(first_code, first_code)
														
 
															+            same_group_titles = outline_by_first.get(actual_first_code, {}).get('subsections', [])
														
 
															             best_score_same = 0.0
														
 
															             best_match_same = None
														
--- a/views/construction_review/file_upload.py
+++ b/views/construction_review/file_upload.py
@@ -33,12 +33,19 @@ def _find_soffice_path() -> str:
 
															         FileNotFoundError: 未找到 LibreOffice
														
 
															     """
														
 
															     import platform
														
 
															+    import shutil
														
 
															     # Linux/Docker 环境：直接使用 soffice
														
 
															     if platform.system() != 'Windows':
														
 
															         return 'soffice'
														
 
															-    # Windows 环境：检测常见安装路径
														
 
															+    # Windows 环境：首先从 PATH 中查找
														
 
															+    soffice_in_path = shutil.which('soffice')
														
 
															+    if soffice_in_path:
														
 
															+        logger.info(f"从 PATH 找到 LibreOffice: {soffice_in_path}")
														
 
															+        return soffice_in_path
														
 
															+
														
 
															+    # 备选：检测常见安装路径
														
 
															     possible_paths = [
														
 
															         r"C:\Program Files\LibreOffice\program\soffice.exe",
														
 
															         r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",