3 недель назад · 67acc3be81
--- a/core/construction_review/component/doc_worker/pdf_worker/html_to_markdown.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/html_to_markdown.py
@@ -0,0 +1,246 @@
 
				+"""
			
 
				+HTML 到 Markdown 转换器
			
 
				+
			
 
				+用于将 MinerU 返回的 HTML 格式转换为 Markdown 格式。
			
 
				+使用 markdownify 库，支持表格、列表、标题等复杂结构转换。
			
 
				+"""
			
 
				+
			
 
				+from __future__ import annotations
			
 
				+
			
 
				+import re
			
 
				+from typing import Optional
			
 
				+
			
 
				+try:
			
 
				+    import markdownify
			
 
				+    MARKDOWNIFY_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    MARKDOWNIFY_AVAILABLE = False
			
 
				+
			
 
				+
			
 
				+class HTMLToMarkdownConverter:
			
 
				+    """HTML 到 Markdown 转换器"""
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        """初始化转换器"""
			
 
				+        if not MARKDOWNIFY_AVAILABLE:
			
 
				+            raise ImportError(
			
 
				+                "markdownify 库未安装。请运行: pip install markdownify"
			
 
				+            )
			
 
				+    
			
 
				+    def convert(self, html_content: str, **options) -> str:
			
 
				+        """
			
 
				+        将 HTML 转换为 Markdown
			
 
				+        
			
 
				+        参数:
			
 
				+            html_content: HTML 内容
			
 
				+            **options: 转换选项
			
 
				+                - heading_style: 标题样式 ("ATX" 或 "SETEXT"), 默认 "ATX"
			
 
				+                - bullets: 列表符号, 默认 "*"
			
 
				+                - strip: 是否删除未知标签, 默认 False
			
 
				+                - convert_tables: 是否转换表格, 默认 True
			
 
				+                - escape_asterisks: 是否转义星号, 默认 False
			
 
				+                - escape_underscores: 是否转义下划线, 默认 False
			
 
				+        
			
 
				+        返回:
			
 
				+            Markdown 格式文本
			
 
				+        """
			
 
				+        if not html_content or not html_content.strip():
			
 
				+            return ""
			
 
				+        
			
 
				+        # 默认选项
			
 
				+        default_options = {
			
 
				+            'heading_style': 'ATX',  # ATX: # 标题, SETEXT: 下划线标题
			
 
				+            'bullets': '*',          # 列表符号
			
 
				+            'convert': ['b', 'strong', 'i', 'em', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 
			
 
				+                       'ul', 'ol', 'li', 'br', 'hr', 'a', 'img', 'table', 'tr', 'td', 'th'],
			
 
				+        }
			
 
				+        
			
 
				+        # 合并用户选项（排除冲突的 strip 参数）
			
 
				+        for key, value in options.items():
			
 
				+            if key != 'strip':  # 避免与 convert 冲突
			
 
				+                default_options[key] = value
			
 
				+        
			
 
				+        try:
			
 
				+            # 使用 markdownify 转换
			
 
				+            markdown_content = markdownify.markdownify(
			
 
				+                html_content,
			
 
				+                **default_options
			
 
				+            )
			
 
				+            
			
 
				+            # 后处理：清理多余空行、修复表格格式等
			
 
				+            markdown_content = self._post_process(markdown_content)
			
 
				+            
			
 
				+            return markdown_content
			
 
				+            
			
 
				+        except Exception as e:
			
 
				+            print(f"[WARN] HTML 转 Markdown 失败: {e}")
			
 
				+            # 降级方案：返回清理后的纯文本
			
 
				+            return self._fallback_convert(html_content)
			
 
				+    
			
 
				+    def _post_process(self, markdown_content: str) -> str:
			
 
				+        """
			
 
				+        后处理 Markdown 内容
			
 
				+        
			
 
				+        1. 清理多余空行
			
 
				+        2. 修复表格格式
			
 
				+        3. 清理 HTML 标签残留
			
 
				+        """
			
 
				+        if not markdown_content:
			
 
				+            return ""
			
 
				+        
			
 
				+        # 1. 清理多余空行（超过2个连续空行改为2个）
			
 
				+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
			
 
				+        
			
 
				+        # 2. 修复表格分隔符（markdownify 生成的表格可能需要调整）
			
 
				+        # 确保表格分隔行格式正确 |---|---|
			
 
				+        markdown_content = self._fix_table_format(markdown_content)
			
 
				+        
			
 
				+        # 3. 清理残留的 HTML 标签（如 <span>, <div> 等）
			
 
				+        markdown_content = self._clean_html_tags(markdown_content)
			
 
				+        
			
 
				+        # 4. 清理行首行尾空格
			
 
				+        lines = [line.strip() for line in markdown_content.split('\n')]
			
 
				+        markdown_content = '\n'.join(lines)
			
 
				+        
			
 
				+        # 5. 再次清理多余空行
			
 
				+        markdown_content = re.sub(r'\n{3,}', '\n\n', markdown_content)
			
 
				+        
			
 
				+        return markdown_content.strip()
			
 
				+    
			
 
				+    def _fix_table_format(self, markdown_content: str) -> str:
			
 
				+        """修复表格格式"""
			
 
				+        lines = markdown_content.split('\n')
			
 
				+        result_lines = []
			
 
				+        i = 0
			
 
				+        
			
 
				+        while i < len(lines):
			
 
				+            line = lines[i]
			
 
				+            
			
 
				+            # 检测表格行（以 | 开头和结尾）
			
 
				+            if line.strip().startswith('|') and line.strip().endswith('|'):
			
 
				+                # 检查下一行是否是分隔行
			
 
				+                if i + 1 < len(lines):
			
 
				+                    next_line = lines[i + 1]
			
 
				+                    # 如果下一行不是分隔符行，则插入分隔行
			
 
				+                    if not re.match(r'\s*\|[-:|\s]+\|', next_line):
			
 
				+                        # 计算列数
			
 
				+                        col_count = len([c for c in line.split('|') if c.strip()])
			
 
				+                        if col_count > 0:
			
 
				+                            separator = '|' + '|'.join(['---'] * col_count) + '|'
			
 
				+                            result_lines.append(line)
			
 
				+                            result_lines.append(separator)
			
 
				+                            i += 1
			
 
				+                            continue
			
 
				+                
			
 
				+                # 修复分隔行格式
			
 
				+                if re.match(r'\s*\|[-:|\s]+\|', line):
			
 
				+                    # 标准化分隔行
			
 
				+                    cells = [cell.strip() for cell in line.split('|') if cell.strip()]
			
 
				+                    fixed_cells = []
			
 
				+                    for cell in cells:
			
 
				+                        # 确保是标准分隔符格式
			
 
				+                        if set(cell) <= set('-:| '):
			
 
				+                            fixed_cells.append('---')
			
 
				+                        else:
			
 
				+                            fixed_cells.append(cell)
			
 
				+                    if fixed_cells:
			
 
				+                        line = '|' + '|'.join(fixed_cells) + '|'
			
 
				+            
			
 
				+            result_lines.append(line)
			
 
				+            i += 1
			
 
				+        
			
 
				+        return '\n'.join(result_lines)
			
 
				+    
			
 
				+    def _clean_html_tags(self, markdown_content: str) -> str:
			
 
				+        """清理残留的 HTML 标签"""
			
 
				+        # 清理常见的内联标签，但保留内容
			
 
				+        tags_to_remove = ['span', 'div', 'font', 'b', 'i', 'em', 'strong']
			
 
				+        
			
 
				+        for tag in tags_to_remove:
			
 
				+            # 移除开始标签 <tag ...>
			
 
				+            markdown_content = re.sub(
			
 
				+                rf'<{tag}[^>]*>', 
			
 
				+                '', 
			
 
				+                markdown_content, 
			
 
				+                flags=re.IGNORECASE
			
 
				+            )
			
 
				+            # 移除结束标签 </tag>
			
 
				+            markdown_content = re.sub(
			
 
				+                rf'</{tag}>', 
			
 
				+                '', 
			
 
				+                markdown_content, 
			
 
				+                flags=re.IGNORECASE
			
 
				+            )
			
 
				+        
			
 
				+        # 清理注释
			
 
				+        markdown_content = re.sub(r'<!--.*?-->', '', markdown_content, flags=re.DOTALL)
			
 
				+        
			
 
				+        # 清理 &nbsp; 等 HTML 实体
			
 
				+        markdown_content = markdown_content.replace('&nbsp;', ' ')
			
 
				+        markdown_content = markdown_content.replace('&lt;', '<')
			
 
				+        markdown_content = markdown_content.replace('&gt;', '>')
			
 
				+        markdown_content = markdown_content.replace('&amp;', '&')
			
 
				+        markdown_content = markdown_content.replace('&quot;', '"')
			
 
				+        
			
 
				+        return markdown_content
			
 
				+    
			
 
				+    def _fallback_convert(self, html_content: str) -> str:
			
 
				+        """
			
 
				+        降级转换方案：当 markdownify 失败时使用
			
 
				+        
			
 
				+        简单的标签剥离，保留文本内容
			
 
				+        """
			
 
				+        # 移除 script 和 style 标签及其内容
			
 
				+        text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
			
 
				+        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
			
 
				+        
			
 
				+        # 将 <br>, <p> 等转为换行
			
 
				+        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
			
 
				+        
			
 
				+        # 剥离所有 HTML 标签
			
 
				+        text = re.sub(r'<[^>]+>', '', text)
			
 
				+        
			
 
				+        # 清理 HTML 实体
			
 
				+        text = text.replace('&nbsp;', ' ')
			
 
				+        text = text.replace('&lt;', '<')
			
 
				+        text = text.replace('&gt;', '>')
			
 
				+        text = text.replace('&amp;', '&')
			
 
				+        
			
 
				+        # 清理多余空行
			
 
				+        text = re.sub(r'\n{3,}', '\n\n', text)
			
 
				+        
			
 
				+        return text.strip()
			
 
				+
			
 
				+
			
 
				+# 全局转换器实例
			
 
				+_converter: Optional[HTMLToMarkdownConverter] = None
			
 
				+
			
 
				+
			
 
				+def get_converter() -> HTMLToMarkdownConverter:
			
 
				+    """获取全局转换器实例（单例模式）"""
			
 
				+    global _converter
			
 
				+    if _converter is None:
			
 
				+        _converter = HTMLToMarkdownConverter()
			
 
				+    return _converter
			
 
				+
			
 
				+
			
 
				+def convert_html_to_markdown(html_content: str, **options) -> str:
			
 
				+    """
			
 
				+    便捷函数：将 HTML 转换为 Markdown
			
 
				+    
			
 
				+    参数:
			
 
				+        html_content: HTML 内容
			
 
				+        **options: 转换选项
			
 
				+    
			
 
				+    返回:
			
 
				+        Markdown 格式文本
			
 
				+    """
			
 
				+    try:
			
 
				+        converter = get_converter()
			
 
				+        return converter.convert(html_content, **options)
			
 
				+    except ImportError:
			
 
				+        # 如果 markdownify 未安装，使用降级方案
			
 
				+        print("[WARN] markdownify 未安装，使用降级转换方案")
			
 
				+        converter = HTMLToMarkdownConverter.__new__(HTMLToMarkdownConverter)
			
 
				+        return converter._fallback_convert(html_content)
			
--- a/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
+++ b/core/construction_review/component/doc_worker/pdf_worker/mineru_extractor.py
@@ -2,12 +2,14 @@
 
				 MinerU 本地部署版本全文提取实现
			
 
				 
			
 
				 使用本地部署的 MinerU 服务进行 OCR 识别
			
 
				+支持返回 HTML 格式自动转换为 Markdown
			
 
				 """
			
 
				 
			
 
				 from __future__ import annotations
			
 
				 
			
 
				 import json
			
 
				 import os
			
 
				+import re
			
 
				 import requests
			
 
				 from pathlib import Path
			
 
				 from typing import Any, Dict, List, Optional
			
@@ -15,6 +17,13 @@ from typing import Any, Dict, List, Optional
 
				 from ..config.provider import default_config_provider
			
 
				 from ..interfaces import DocumentSource, FullTextExtractor
			
 
				 
			
 
				+# 尝试导入 HTML 到 Markdown 转换器
			
 
				+try:
			
 
				+    from .html_to_markdown import convert_html_to_markdown, HTMLToMarkdownConverter
			
 
				+    HTML_CONVERTER_AVAILABLE = True
			
 
				+except ImportError:
			
 
				+    HTML_CONVERTER_AVAILABLE = False
			
 
				+
			
 
				 
			
 
				 class LocalMinerUFullTextExtractor(FullTextExtractor):
			
 
				     """使用本地部署的 MinerU 提取 PDF 全文内容。"""
			
@@ -123,6 +132,8 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
 
				     def _extract_markdown_from_result(self, result: Dict[str, Any]) -> str:
			
 
				         """
			
 
				         从 MinerU 返回结果中提取 markdown 内容。
			
 
				+        
			
 
				+        支持自动检测 HTML 格式并转换为 Markdown。
			
 
				 
			
 
				         参数:
			
 
				             result: MinerU API 返回的 JSON 数据
			
@@ -130,33 +141,42 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
 
				         返回:
			
 
				             提取的 markdown 文本
			
 
				         """
			
 
				+        raw_content = None
			
 
				+        content_source = None
			
 
				+        
			
 
				         # 尝试多种可能的结果格式
			
 
				 
			
 
				         # 格式1: 直接返回 full_text 字段
			
 
				         if "full_text" in result:
			
 
				-            return result["full_text"]
			
 
				+            raw_content = result["full_text"]
			
 
				+            content_source = "full_text"
			
 
				 
			
 
				         # 格式2: data.full_text
			
 
				-        if "data" in result and isinstance(result["data"], dict):
			
 
				+        elif "data" in result and isinstance(result["data"], dict):
			
 
				             if "full_text" in result["data"]:
			
 
				-                return result["data"]["full_text"]
			
 
				+                raw_content = result["data"]["full_text"]
			
 
				+                content_source = "data.full_text"
			
 
				             # 格式3: data.markdown
			
 
				-            if "markdown" in result["data"]:
			
 
				-                return result["data"]["markdown"]
			
 
				+            elif "markdown" in result["data"]:
			
 
				+                raw_content = result["data"]["markdown"]
			
 
				+                content_source = "data.markdown"
			
 
				             # 格式4: data.content
			
 
				-            if "content" in result["data"]:
			
 
				-                return result["data"]["content"]
			
 
				+            elif "content" in result["data"]:
			
 
				+                raw_content = result["data"]["content"]
			
 
				+                content_source = "data.content"
			
 
				 
			
 
				         # 格式5: markdown 字段
			
 
				-        if "markdown" in result:
			
 
				-            return result["markdown"]
			
 
				+        elif "markdown" in result:
			
 
				+            raw_content = result["markdown"]
			
 
				+            content_source = "markdown"
			
 
				 
			
 
				         # 格式6: content 字段
			
 
				-        if "content" in result:
			
 
				-            return result["content"]
			
 
				+        elif "content" in result:
			
 
				+            raw_content = result["content"]
			
 
				+            content_source = "content"
			
 
				 
			
 
				         # 格式7: 遍历 pages 提取内容
			
 
				-        if "pages" in result:
			
 
				+        elif "pages" in result:
			
 
				             pages_text = []
			
 
				             for page in result["pages"]:
			
 
				                 if isinstance(page, dict):
			
@@ -167,17 +187,20 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
 
				                     elif "content" in page:
			
 
				                         pages_text.append(page["content"])
			
 
				             if pages_text:
			
 
				-                return "\n\n".join(pages_text)
			
 
				+                raw_content = "\n\n".join(pages_text)
			
 
				+                content_source = "pages"
			
 
				 
			
 
				         # 格式8: 本地 MinerU API 格式
			
 
				         # {"results": {"filename": {"md_content": "..."}}}
			
 
				-        if "results" in result and isinstance(result["results"], dict):
			
 
				+        elif "results" in result and isinstance(result["results"], dict):
			
 
				             for filename, file_data in result["results"].items():
			
 
				                 if isinstance(file_data, dict) and "md_content" in file_data:
			
 
				-                    return file_data["md_content"]
			
 
				+                    raw_content = file_data["md_content"]
			
 
				+                    content_source = "results.md_content"
			
 
				+                    break
			
 
				 
			
 
				         # 格式9: results 列表
			
 
				-        if "results" in result and isinstance(result["results"], list):
			
 
				+        elif "results" in result and isinstance(result["results"], list):
			
 
				             texts = []
			
 
				             for item in result["results"]:
			
 
				                 if isinstance(item, dict):
			
@@ -188,10 +211,91 @@ class LocalMinerUFullTextExtractor(FullTextExtractor):
 
				                     elif "text" in item:
			
 
				                         texts.append(item["text"])
			
 
				             if texts:
			
 
				-                return "\n\n".join(texts)
			
 
				+                raw_content = "\n\n".join(texts)
			
 
				+                content_source = "results.list"
			
 
				 
			
 
				         # 如果都没找到，打印原始结果用于调试
			
 
				-        print("警告: 无法从 MinerU 结果中提取内容，返回空字符串")
			
 
				-        print(f"结果结构: {list(result.keys())}")
			
 
				-
			
 
				-        return ""
			
 
				+        if raw_content is None:
			
 
				+            print("警告: 无法从 MinerU 结果中提取内容，返回空字符串")
			
 
				+            print(f"结果结构: {list(result.keys())}")
			
 
				+            return ""
			
 
				+        
			
 
				+        # 检测并转换 HTML 格式
			
 
				+        if raw_content and self._is_html_content(raw_content):
			
 
				+            print(f"[INFO] 检测到 HTML 格式内容（来源: {content_source}），自动转换为 Markdown")
			
 
				+            raw_content = self._convert_html_to_markdown(raw_content)
			
 
				+        
			
 
				+        return raw_content
			
 
				+    
			
 
				+    def _is_html_content(self, content: str) -> bool:
			
 
				+        """
			
 
				+        检测内容是否为 HTML 格式
			
 
				+        
			
 
				+        通过检查是否包含常见的 HTML 标签来判断
			
 
				+        """
			
 
				+        if not content or not isinstance(content, str):
			
 
				+            return False
			
 
				+        
			
 
				+        # 检查是否包含常见的 HTML 标签
			
 
				+        html_tags_pattern = r'<(?:html|head|body|div|span|p|br|hr|table|tr|td|th|ul|ol|li|h[1-6]|b|i|em|strong|a|img|meta|title|link|script|style)[^>]*>'
			
 
				+        
			
 
				+        # 如果找到多个 HTML 标签，认为是 HTML 内容
			
 
				+        matches = re.findall(html_tags_pattern, content, re.IGNORECASE)
			
 
				+        
			
 
				+        # 至少找到 2 个 HTML 标签才认为是 HTML（减少误判）
			
 
				+        return len(matches) >= 2
			
 
				+    
			
 
    def _convert_html_to_markdown(self, html_content: str) -> str:
        """Convert HTML to Markdown, degrading to plain-text extraction.

        Uses ``convert_html_to_markdown`` when the converter module could be
        imported. If it could not, or if the conversion raises, a warning is
        printed and ``_simple_html_to_text`` is used instead.
        """
        if not HTML_CONVERTER_AVAILABLE:
            print("[WARN] HTML 转换器不可用，使用简单文本提取")
            return self._simple_html_to_text(html_content)

        try:
            return convert_html_to_markdown(html_content)
        except Exception as exc:
            print(f"[WARN] HTML 转 Markdown 失败: {exc}，使用降级方案")
            return self._simple_html_to_text(html_content)
			
 
				+    
			
 
				+    def _simple_html_to_text(self, html_content: str) -> str:
			
 
				+        """
			
 
				+        简单的 HTML 到文本转换（降级方案）
			
 
				+        """
			
 
				+        if not html_content:
			
 
				+            return ""
			
 
				+        
			
 
				+        # 移除 script 和 style 标签及其内容
			
 
				+        text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
			
 
				+        text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
			
 
				+        
			
 
				+        # 将常见块级标签转为换行
			
 
				+        text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</p>', '\n\n', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</div>', '\n', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</tr>', '\n', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</td>', ' ', text, flags=re.IGNORECASE)
			
 
				+        text = re.sub(r'</th>', ' ', text, flags=re.IGNORECASE)
			
 
				+        
			
 
				+        # 处理标题标签
			
 
				+        for i in range(6, 0, -1):
			
 
				+            text = re.sub(rf'<h{i}[^>]*>(.*?)</h{i}>', rf'{"#" * i} \1\n\n', text, flags=re.IGNORECASE | re.DOTALL)
			
 
				+        
			
 
				+        # 剥离所有剩余的 HTML 标签
			
 
				+        text = re.sub(r'<[^>]+>', '', text)
			
 
				+        
			
 
				+        # 清理 HTML 实体
			
 
				+        text = text.replace('&nbsp;', ' ')
			
 
				+        text = text.replace('&lt;', '<')
			
 
				+        text = text.replace('&gt;', '>')
			
 
				+        text = text.replace('&amp;', '&')
			
 
				+        text = text.replace('&quot;', '"')
			
 
				+        text = text.replace('&#39;', "'")
			
 
				+        
			
 
				+        # 清理多余空行
			
 
				+        text = re.sub(r'\n{3,}', '\n\n', text)
			
 
				+        
			
 
				+        return text.strip()
			
--- a/core/construction_review/component/doc_worker/utils/prompt_loader.py
+++ b/core/construction_review/component/doc_worker/utils/prompt_loader.py
@@ -56,9 +56,9 @@ class PromptLoader:
 
				         with self._csv_file.open("r", encoding="utf-8-sig") as f:  # 使用 utf-8-sig 自动处理 BOM
			
 
				             reader = csv.DictReader(f)
			
 
				             for row in reader:
			
 
				-                # 新CSV格式：first_contents_code, first_contents, second_contents_code, second_contents
			
 
				-                level1 = (row.get("first_contents") or "").strip()
			
 
				-                level2 = (row.get("second_contents") or "").strip()
			
 
				+                # CSV格式：first_code, first_name, second_code, second_name, ...
			
 
				+                level1 = (row.get("first_name") or "").strip()
			
 
				+                level2 = (row.get("second_name") or "").strip()
			
 
				                 
			
 
				                 # 跳过空的一级目录
			
 
				                 if not level1: