2 veckor sedan · 28eb485205
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 
															 FROM python:3.12-slim
														
 
															-# 安装 OpenCV 系统依赖（更完整的列表）
														
 
															+# 安装 OpenCV 系统依赖及 LibreOffice（docx/doc 转 PDF）
														
 
															 RUN apt-get update && apt-get install -y \
														
 
															     # OpenCV 核心依赖
														
 
															     libgl1 \
														
@@ -26,6 +26,12 @@ RUN apt-get update && apt-get install -y \
 
															     # 其他可能需要的库
														
 
															     libfontconfig1 \
														
 
															     libfreetype6 \
														
 
															+    # LibreOffice（用于 docx/doc 转 PDF）
														
 
															+    libreoffice-writer \
														
 
															+    libreoffice-core \
														
 
															+    # 中文字体（PDF 转换中文支持）
														
 
															+    fonts-wqy-zenhei \
														
 
															+    --no-install-recommends \
														
 
															     && rm -rf /var/lib/apt/lists/*
														
 
															 ENV DEBIAN_FRONTEND=noninteractive \
														
--- a/core/base/__init__.py
+++ b/core/base/__init__.py
@@ -1,27 +1,27 @@
 
															 """
														
 
															 文档分类切分库
														
 
															-支持PDF和Word文档的目录提取、智能分类和文本切分
														
 
															+支持PDF文档的目录提取、智能分类和文本切分
														
 
															 主要功能：
														
 
															-1. 提取PDF/Word文档的目录结构
														
 
															+1. 提取PDF文档的目录结构
														
 
															 2. 识别和校验目录的层级关系
														
 
															 3. 基于二级目录关键词匹配对一级目录进行智能分类
														
 
															 4. 按目录层级和字符数智能切分文本
														
 
															 5. 保存分类结果到多种格式
														
 
															 使用示例（当前推荐直接使用业务层封装的 DocumentProcessor，而不是底层分类器类）。
														
 
															+
														
 
															+注意: DOCX/DOC 文件应在上传层转换为 PDF，本模块不再直接处理 DOCX
														
 
															 """
														
 
															-__version__ = "2.0.0"
														
 
															+__version__ = "2.1.0"
														
 
															 __author__ = "Your Name"
														
 
															 from core.construction_review.component.doc_worker.interfaces import TOCExtractor, TextSplitter
														
 
															 from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
														
 
															 from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
														
 
															-from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
														
 
															 from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
														
 
															-from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
														
 
															 __all__ = [
														
@@ -29,8 +29,6 @@ __all__ = [
 
															     'TextSplitter',
														
 
															     'HierarchyClassifier',
														
 
															     'PdfTOCExtractor',
														
 
															-    'DocxTOCExtractor',
														
 
															     'PdfTextSplitter',
														
 
															-    'DocxTextSplitter',
														
 
															 ]
														
--- a/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
+++ b/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
@@ -1,4 +1,4 @@
 
															-first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus,keywords
														
 
															+first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus,keywords
														
 
															 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。,国家法律;法规;规章;强制性条文;国务院令;住房城乡建设部;中华人民共和国
														
 
															 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。,省级;地方法规;省政府;地方规章;属地管理;四川省;省人民政府
														
 
															 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准（如GB/T、JTG等）、时效性强（需跟踪最新版）、覆盖全生命周期（设计→施工→运维）、是定义工程项目的最低技术要求、质量验收准则、安全红线。,GB/T;JTG;CJJ;行业标准;国家标准;推荐性标准;GB 5;TB;HJ;DL
														
@@ -56,7 +56,6 @@ technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规
 
															 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,TemporaryWaterAndElectricityConsumption,临时水电用量,需计算施工期间的用水、用电量（如“临时用水管径DN100”“临时用电容量500kW”）、用于临时设施的设计；,临时用水量;临时用电量;用水量;用电量;水电用量
														
 
															 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,TheSiteIsFlat,场地平整,需明确平整的范围、标高（如“平整场地至设计标高±0.000”“压实度达到90%”）、是施工场地准备的基础；,场地平整;整平场地;标高;压实度;平整
														
 
															 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,Staffing,人员配置,需列出各岗位的人员数量（如“项目经理1名”“施工员2名”“钢筋工10名”）、是劳动力管理的核心；,人员配置;岗位人员;项目经理;施工员;人员配备;人员分工
														
 
															-technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,EquipmentEntry,设备进场,需明确设备的进场时间、运输方式（如“塔式起重机进场时间2026年3月1日”“采用平板车运输”）、是设备准备的关键；,设备进场;进场时间;进场方式;进场日期;机械进场
														
 
															 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,SafetyProtectionFacilities,安全防护措施,需列出现场的安全设施（如“安全网”“防护栏杆”“消防栓”）、是安全保障的基础；,安全防护;安全网;防护栏杆;消防设施;安全设施;防护措施
														
 
															 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,PersonnelAccess,人员上下通道,需明确通道的形式、位置（如“楼梯间通道”“脚手架斜道”）、是人员通行的安全保障。,人员通道;上下通道;楼梯通道;斜道;人员上下;通道布置
														
 
															 technology,施工工艺技术,Process,工艺流程,工序专业名称类、工程名称类、数值类、数值单位类,ConstructionProcess,施工工序,需列出工程的主要工序（如“地基处理→基础浇筑→主体结构→装饰装修”）、是工艺流程的核心；,施工工序;主要工序;工序流程;施工顺序;工艺步骤
														
@@ -101,10 +100,10 @@ quality,质量保证措施,Excellence,工程创优规划,工程创优总体计
 
															 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,NewTechnologyPromotion,新技术推广,需应用“四新技术”（新技术、新材料、新工艺、新设备）、提升创优的技术含量；,四新技术;新技术推广;新工艺;新材料;新设备;技术创新应用
														
 
															 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,PreparationOfApplicationMaterials,申报资料编制,需整理创优所需的资料（如工程质量报告、技术创新成果）、是创优申报的核心材料；,申报资料;创优申报;工程质量报告;申报材料
														
 
															 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,EngineeringDataArchiving,工程资料归档,需确保资料真实、完整、符合创优评审要求。,工程资料归档;档案管理;竣工资料;资料归档
														
 
															-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,RawMaterialInspection,原材料进场检验,需执行“三证一检”（合格证、质检报告、生产许可证+进场复检）、确保材料质量；,原材料进场;三证一检;材料检验;复检报告;进场材料质量
														
 
															-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PhysicalProjectQualityAcceptance,实体工程质量验收,需按分项（如“钢筋绑扎”）、分部工程（如“基础工程”）进行验收、符合规范要求；,实体验收;分项验收;分部验收;实体工程验收;工程质量验收
														
 
															-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PreventionAndControlOfCommonQualityDefectsInProcesses,工序质量通病防治,需针对常见问题（如“墙面空鼓”“屋面渗漏”）制定专项措施（如“抹灰前基层凿毛”“防水附加层施工”）、减少质量缺陷；,质量通病;空鼓;渗漏;裂缝;蜂窝麻面;防治措施;通病防治
														
 
															-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,SeasonalConstructionQualityAssuranceMeasures,季节性施工质量保证措施,需针对冬期（混凝土保温）、雨期（防水加强）、高温（混凝土保湿）制定专项措施、确保施工质量；,季节性施工;冬期施工;雨期施工;高温施工;夏季施工;冬季混凝土
														
 
															+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,RawMaterialInspection,原材料检查验收,需执行“三证一检”（合格证、质检报告、生产许可证+进场复检）、确保材料质量；,原材料进场;三证一检;材料检验;复检报告;进场材料质量
														
 
															+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PhysicalProjectQualityAcceptance,实体工程质量验收,需按分项（如“钢筋绑扎”）、分部工程（如“基础工程”）进行验收、符合规范要求；,实体验收;分项验收;分部验收;实体工程验收;工程质量验收
														
 
															+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PreventionAndControlOfCommonQualityDefectsInProcesses,工序质量通病防治,需针对常见问题（如“墙面空鼓”“屋面渗漏”）制定专项措施（如“抹灰前基层凿毛”“防水附加层施工”）、减少质量缺陷；,质量通病;空鼓;渗漏;裂缝;蜂窝麻面;防治措施;通病防治
														
 
															+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,SeasonalConstructionQualityAssuranceMeasures,季节性施工质量保证措施,需针对冬期（混凝土保温）、雨期（防水加强）、高温（混凝土保湿）制定专项措施、确保施工质量；,季节性施工;冬期施工;雨期施工;高温施工;夏季施工;冬季混凝土
														
 
															 environment,环境保证措施,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,BlockDiagramOfEnvironmentalAssuranceSystem,环境保证体系框图,环境保证体系的视觉化呈现、需明确体系的核心要素（如组织机构、制度流程、资源保障）及逻辑关系、是公司标准体系的具象化载体；,环境保证体系;环境管理体系框图;环境保证体系框图
														
 
															 environment,环境保证措施,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,CompanyStandardSystemReference,公司标准体系引用,应引用公司标准体系框图、强调环境保证体系需承接公司现有标准（如《公司环境管理体系手册》《公司环境保护管理办法》）、确保体系的一致性与延续性；,环境管理体系;环境保护管理办法;公司环境标准;环境体系引用
														
 
															 environment,环境保证措施,EnvOrg,环境保护组织机构,环境保护组织架构、管理人员姓名、管理人员职务、管理人员职责、环境管理岗位责任、责任考核机制、环境管理职责分工、环境管理人员资质、环境管理沟通机制,EnvironmentalAssuranceSystemFramework,环境保护组织架构,包含管理人员姓名、职务、职责、环境管理的责任主体、基于项目经理为组长的工作领导小组、小组中包括项目经理、项目副经理、项目总工、工程部门、质检部门、安全环保部门、专业分包单位（协作队伍）项目负责人和项目技术负责人等、需明确机构的层级（如公司级、项目级、班组级）及组成部门（如环境部、工程部、技术部）、形成“横向到边、纵向到底”的管理网络；,环境保护组织;环境管理机构;环境管理组织架构;环境领导小组
														
--- a/core/construction_review/component/doc_worker/docx_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/docx_worker/__init__.py
@@ -1,17 +0,0 @@
 
															-"""
														
 
															-DOCX 文档处理模块
														
 
															-
														
 
															-提供 DOCX 文件的目录提取、全文提取、文本切分等功能。
														
 
															-"""
														
 
															-
														
 
															-from .pipeline import DocxPipeline
														
 
															-from .toc_extractor import DocxTOCExtractor
														
 
															-from .full_text_extractor import DocxFullTextExtractor
														
 
															-from .text_splitter import DocxTextSplitter
														
 
															-
														
 
															-__all__ = [
														
 
															-    "DocxPipeline",
														
 
															-    "DocxTOCExtractor",
														
 
															-    "DocxFullTextExtractor",
														
 
															-    "DocxTextSplitter",
														
 
															-]
														
--- a/core/construction_review/component/doc_worker/docx_worker/cli.py
+++ b/core/construction_review/component/doc_worker/docx_worker/cli.py
@@ -1,118 +0,0 @@
 
															-"""
														
 
															-DOCX 处理命令行接口
														
 
															-
														
 
															-用法示例：
														
 
															-  python -m file_parse.docx_worker.cli input.docx
														
 
															-  python -m file_parse.docx_worker.cli input.docx -l 1 --max-size 3000 --min-size 50
														
 
															-  python -m file_parse.docx_worker.cli input.docx -o ./output
														
 
															-"""
														
 
															-
														
 
															-import argparse
														
 
															-import json
														
 
															-import sys
														
 
															-from datetime import datetime
														
 
															-from pathlib import Path
														
 
															-
														
 
															-from ..interfaces import DocumentSource
														
 
															-from .pipeline import DocxPipeline
														
 
															-
														
 
															-
														
 
															-def main():
														
 
															-    parser = argparse.ArgumentParser(description="DOCX 文档处理工具")
														
 
															-    parser.add_argument("docx_path", help="输入 DOCX 文件路径")
														
 
															-    parser.add_argument(
														
 
															-        "-l", "--level",
														
 
															-        type=int,
														
 
															-        help="目标层级（默认从配置读取）"
														
 
															-    )
														
 
															-    parser.add_argument(
														
 
															-        "--max-size",
														
 
															-        type=int,
														
 
															-        help="最大块大小（默认从配置读取）"
														
 
															-    )
														
 
															-    parser.add_argument(
														
 
															-        "--min-size",
														
 
															-        type=int,
														
 
															-        help="最小块大小（默认从配置读取）"
														
 
															-    )
														
 
															-    parser.add_argument(
														
 
															-        "-o", "--output",
														
 
															-        help="输出目录（默认为 ./output）"
														
 
															-    )
														
 
															-    
														
 
															-    args = parser.parse_args()
														
 
															-
														
 
															-    # 检查文件是否存在
														
 
															-    docx_path = Path(args.docx_path)
														
 
															-    if not docx_path.exists():
														
 
															-        print(f"错误：文件不存在 -> {docx_path}", file=sys.stderr)
														
 
															-        sys.exit(1)
														
 
															-
														
 
															-    # 创建输出目录
														
 
															-    output_dir = Path(args.output) if args.output else Path("./output")
														
 
															-    output_dir.mkdir(parents=True, exist_ok=True)
														
 
															-
														
 
															-    # 创建文档源
														
 
															-    source = DocumentSource(path=docx_path, file_type="docx")
														
 
															-
														
 
															-    # 运行处理流程
														
 
															-    try:
														
 
															-        pipeline = DocxPipeline()
														
 
															-        result = pipeline.run(
														
 
															-            source,
														
 
															-            target_level=args.level,
														
 
															-            max_chunk_size=args.max_size,
														
 
															-            min_chunk_size=args.min_size,
														
 
															-        )
														
 
															-    except Exception as e:
														
 
															-        print(f"处理失败：{e}", file=sys.stderr)
														
 
															-        import traceback
														
 
															-        traceback.print_exc()
														
 
															-        sys.exit(1)
														
 
															-
														
 
															-    # 生成输出文件名
														
 
															-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
														
 
															-    base_name = docx_path.stem
														
 
															-    output_file = output_dir / f"{base_name}_完整结果_{timestamp}.json"
														
 
															-
														
 
															-    # 构建完整输出结构
														
 
															-    output_data = {
														
 
															-        "source_file": str(docx_path.absolute()),
														
 
															-        "process_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
														
 
															-        "toc_summary": {
														
 
															-            "total_items": result["toc_info"]["toc_count"],
														
 
															-            "toc_pages": result["toc_info"]["toc_pages"],
														
 
															-        },
														
 
															-        "complete_toc_list": [
														
 
															-            {
														
 
															-                "index": i + 1,
														
 
															-                "title": item["title"],
														
 
															-                "page": item["page"],
														
 
															-                "level": item["level"],
														
 
															-                "original": item["original"],
														
 
															-            }
														
 
															-            for i, item in enumerate(result["toc_info"]["toc_items"])
														
 
															-        ],
														
 
															-        "classification_summary": {
														
 
															-            "target_level": result["meta"]["target_level"],
														
 
															-            "total_count": result["classification"]["total_count"],
														
 
															-            "categories": result["classification"].get("category_stats", {}),
														
 
															-        },
														
 
															-        "classified_items": result["classification"]["items"],
														
 
															-        "chunks": result["chunks"],
														
 
															-        "meta": result["meta"],
														
 
															-    }
														
 
															-
														
 
															-    # 写入文件
														
 
															-    with output_file.open("w", encoding="utf-8") as f:
														
 
															-        json.dump(output_data, f, ensure_ascii=False, indent=2)
														
 
															-
														
 
															-    print(f"\n处理完成！")
														
 
															-    print(f"  输出文件: {output_file}")
														
 
															-    print(f"  目录项数: {result['toc_info']['toc_count']}")
														
 
															-    print(f"  分类项数: {result['classification']['total_count']}")
														
 
															-    print(f"  文本块数: {len(result['chunks'])}")
														
 
															-
														
 
															-
														
 
															-if __name__ == "__main__":
														
 
															-    main()
														
--- a/core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py
@@ -1,110 +0,0 @@
 
															-"""
														
 
															-DOCX 全文提取实现
														
 
															-
														
 
															-提取 DOCX 文档的全文内容，按段落组织，模拟分页。
														
 
															-"""
														
 
															-
														
 
															-from __future__ import annotations
														
 
															-
														
 
															-import re
														
 
															-from io import BytesIO
														
 
															-from pathlib import Path
														
 
															-from typing import Any, Dict, List
														
 
															-
														
 
															-from docx import Document
														
 
															-
														
 
															-from ..interfaces import FullTextExtractor, DocumentSource
														
 
															-
														
 
															-
														
 
															-class DocxFullTextExtractor(FullTextExtractor):
														
 
															-    """DOCX 全文提取器"""
														
 
															-
														
 
															-    def __init__(self, paragraphs_per_page: int = 30):
														
 
															-        """
														
 
															-        初始化
														
 
															-        
														
 
															-        Args:
														
 
															-            paragraphs_per_page: 每页段落数（用于模拟分页）
														
 
															-        """
														
 
															-        self.paragraphs_per_page = paragraphs_per_page
														
 
															-
														
 
															-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        提取 DOCX 文档的全文内容
														
 
															-        
														
 
															-        返回结构：
														
 
															-        [
														
 
															-            {
														
 
															-                "page_num": int,
														
 
															-                "text": str,
														
 
															-                "start_pos": int,
														
 
															-                "end_pos": int,
														
 
															-                "source_file": str,
														
 
															-            },
														
 
															-            ...
														
 
															-        ]
														
 
															-        """
														
 
															-        # 加载文档
														
 
															-        if source.path:
														
 
															-            doc = Document(source.path)
														
 
															-            source_file = str(source.path)
														
 
															-        elif source.content:
														
 
															-            doc = Document(BytesIO(source.content))
														
 
															-            source_file = "bytes_stream"
														
 
															-        else:
														
 
															-            raise ValueError("DocumentSource 必须提供 path 或 content")
														
 
															-
														
 
															-        # 按照文档中的实际顺序提取段落和表格
														
 
															-        # 创建段落和表格的元素到对象的映射
														
 
															-        para_map = {para._element: para for para in doc.paragraphs}
														
 
															-        table_map = {table._element: table for table in doc.tables}
														
 
															-        
														
 
															-        # 按照文档中的顺序遍历所有元素
														
 
															-        all_elements = []
														
 
															-        for element in doc.element.body:
														
 
															-            if element in para_map:
														
 
															-                # 段落元素
														
 
															-                para = para_map[element]
														
 
															-                text = para.text
														
 
															-                # 过滤目录行：标题\t页码（页码部分支持带修饰符号）
														
 
															-                # 匹配从开头开始，包含制表符且末尾有数字的模式（目录行特征）
														
 
															-                if text and not re.match(r"^.+\t+.*?\d+.*?\s*$", text):
														
 
															-                    all_elements.append(text)
														
 
															-            elif element in table_map:
														
 
															-                # 表格元素
														
 
															-                table = table_map[element]
														
 
															-                table_text = self._extract_table_text(table)
														
 
															-                all_elements.append(table_text)
														
 
															-
														
 
															-        # 模拟分页：每 N 个元素作为一页
														
 
															-        pages_content = []
														
 
															-        current_pos = 0
														
 
															-        
														
 
															-        # 正则表达式：匹配 [表格开始]...任意内容...[表格结束] 模式
														
 
															-        table_placeholder_pattern = re.compile(
														
 
															-            r'\n?\[表格开始\]\n.*?\n\[表格结束\]\n?',
														
 
															-            re.DOTALL
														
 
															-        )
														
 
															-        
														
 
															-        for page_num in range(0, len(all_elements), self.paragraphs_per_page):
														
 
															-            page_elements = all_elements[page_num:page_num + self.paragraphs_per_page]
														
 
															-            page_text = "\n".join(page_elements)
														
 
															-            
														
 
															-            # 将任何可能存在的 [表格开始]...表格内容...[表格结束] 替换为占位符
														
 
															-            page_text = table_placeholder_pattern.sub('\n<表格></表格>\n', page_text)
														
 
															-            
														
 
															-            pages_content.append({
														
 
															-                "page_num": page_num // self.paragraphs_per_page + 1,
														
 
															-                "text": page_text,
														
 
															-                "start_pos": current_pos,
														
 
															-                "end_pos": current_pos + len(page_text),
														
 
															-                "source_file": source_file,
														
 
															-            })
														
 
															-            
														
 
															-            current_pos += len(page_text)
														
 
															-
														
 
															-        return pages_content
														
 
															-
														
 
															-    def _extract_table_text(self, table) -> str:
														
 
															-        """提取表格占位符，不提取实际内容"""
														
 
															-        return "\n<表格></表格>\n"
														
--- a/core/construction_review/component/doc_worker/docx_worker/pipeline.py
+++ b/core/construction_review/component/doc_worker/docx_worker/pipeline.py
@@ -1,106 +0,0 @@
 
															-"""
														
 
															-DOCX 文档处理流程
														
 
															-
														
 
															-整合目录提取、分类、全文提取、文本切分等步骤。
														
 
															-"""
														
 
															-
														
 
															-from __future__ import annotations
														
 
															-
														
 
															-from pathlib import Path
														
 
															-from typing import Any, Dict, Optional
														
 
															-
														
 
															-from ..interfaces import DocumentPipeline, DocumentSource
														
 
															-from ..config.provider import default_config_provider
														
 
															-from ..classification.hierarchy_classifier import HierarchyClassifier
														
 
															-
														
 
															-from .toc_extractor import DocxTOCExtractor
														
 
															-from .full_text_extractor import DocxFullTextExtractor
														
 
															-from .text_splitter import DocxTextSplitter
														
 
															-
														
 
															-
														
 
															-class DocxPipeline(DocumentPipeline):
														
 
															-    """DOCX 文档处理流水线"""
														
 
															-
														
 
															-    def __init__(self):
														
 
															-        self._cfg = default_config_provider
														
 
															-        self._toc_extractor = DocxTOCExtractor()
														
 
															-        self._full_text_extractor = DocxFullTextExtractor(
														
 
															-            paragraphs_per_page=int(self._cfg.get("toc_extraction.paragraphs_per_page", 30))
														
 
															-        )
														
 
															-        self._text_splitter = DocxTextSplitter()
														
 
															-        self._classifier = HierarchyClassifier()
														
 
															-
														
 
															-    def run(
														
 
															-        self,
														
 
															-        source: DocumentSource,
														
 
															-        target_level: Optional[int] = None,
														
 
															-        max_chunk_size: Optional[int] = None,
														
 
															-        min_chunk_size: Optional[int] = None,
														
 
															-    ) -> Dict[str, Any]:
														
 
															-        """
														
 
															-        运行完整流程
														
 
															-        
														
 
															-        返回：
														
 
															-        {
														
 
															-            "toc_info": {...},
														
 
															-            "classification": {...},
														
 
															-            "chunks": [...],
														
 
															-            "meta": {...},
														
 
															-        }
														
 
															-        """
														
 
															-        # 从配置获取默认值
														
 
															-        if target_level is None:
														
 
															-            target_level = int(self._cfg.get("text_splitting.target_level", 1))
														
 
															-        if max_chunk_size is None:
														
 
															-            max_chunk_size = int(self._cfg.get("text_splitting.max_chunk_size", 3000))
														
 
															-        if min_chunk_size is None:
														
 
															-            min_chunk_size = int(self._cfg.get("text_splitting.min_chunk_size", 50))
														
 
															-
														
 
															-        print(f"开始处理 DOCX 文档...")
														
 
															-        print(f"  目标层级: {target_level}")
														
 
															-        print(f"  最大块大小: {max_chunk_size}")
														
 
															-        print(f"  最小块大小: {min_chunk_size}")
														
 
															-
														
 
															-        # 步骤1: 提取目录
														
 
															-        print("\n步骤1: 提取目录...")
														
 
															-        toc_info = self._toc_extractor.extract_toc(source)
														
 
															-        print(f"  提取到 {toc_info['toc_count']} 个目录项")
														
 
															-
														
 
															-        # 步骤2: 分类目录项
														
 
															-        print("\n步骤2: 分类目录项...")
														
 
															-        classification = self._classifier.classify(toc_info["toc_items"], target_level)
														
 
															-        print(f"  分类完成，共 {classification['total_count']} 个目标层级项")
														
 
															-
														
 
															-        # 步骤3: 提取全文
														
 
															-        print("\n步骤3: 提取全文...")
														
 
															-        pages_content = self._full_text_extractor.extract_full_text(source)
														
 
															-        print(f"  提取到 {len(pages_content)} 页内容")
														
 
															-
														
 
															-        # 步骤4: 切分文本
														
 
															-        print("\n步骤4: 切分文本...")
														
 
															-        chunks = self._text_splitter.split_by_hierarchy(
														
 
															-            classification["items"],
														
 
															-            pages_content,
														
 
															-            toc_info,
														
 
															-            target_level,
														
 
															-            max_chunk_size,
														
 
															-            min_chunk_size,
														
 
															-        )
														
 
															-        print(f"  切分完成，共 {len(chunks)} 个块")
														
 
															-
														
 
															-        # 填充文件名
														
 
															-        file_name = Path(source.path).name if source.path else "unknown.docx"
														
 
															-        for chunk in chunks:
														
 
															-            chunk["file_name"] = file_name
														
 
															-
														
 
															-        return {
														
 
															-            "toc_info": toc_info,
														
 
															-            "classification": classification,
														
 
															-            "chunks": chunks,
														
 
															-            "meta": {
														
 
															-                "target_level": target_level,
														
 
															-                "max_chunk_size": max_chunk_size,
														
 
															-                "min_chunk_size": min_chunk_size,
														
 
															-                "file_type": "docx",
														
 
															-            },
														
 
															-        }
														
--- a/core/construction_review/component/doc_worker/docx_worker/text_splitter.py
+++ b/core/construction_review/component/doc_worker/docx_worker/text_splitter.py
@@ -1,327 +0,0 @@
 
															-"""
														
 
															-DOCX 文本切分实现
														
 
															-
														
 
															-复刻 PDF 处理的切分逻辑：
														
 
															-1. 跳过目录页，只在正文中定位章节标题
														
 
															-2. 按最低目录层级进行切分，形成章节块
														
 
															-3. 对超过最大字符数的块按段落-句子进行再次切分，保持语义完整性
														
 
															-"""
														
 
															-
														
 
															-from __future__ import annotations
														
 
															-
														
 
															-from typing import Any, Dict, List
														
 
															-
														
 
															-from ..config.provider import default_config_provider
														
 
															-from ..interfaces import TextSplitter
														
 
															-from ..utils.title_matcher import TitleMatcher
														
 
															-from ..utils.text_split_support import HierarchicalChunkMixin
														
 
															-
														
 
															-
														
 
															-class DocxTextSplitter(TextSplitter, HierarchicalChunkMixin):
														
 
															-    """按目录层级对 DOCX 正文进行智能分块的实现"""
														
 
															-
														
 
															-    def __init__(self) -> None:
														
 
															-        self._cfg = default_config_provider
														
 
															-        self._title_matcher = TitleMatcher()
														
 
															-
														
 
															-    def split_by_hierarchy(
														
 
															-        self,
														
 
															-        classification_items: List[Dict[str, Any]],
														
 
															-        pages_content: List[Dict[str, Any]],
														
 
															-        toc_info: Dict[str, Any],
														
 
															-        target_level: int,
														
 
															-        max_chunk_size: int,
														
 
															-        min_chunk_size: int,
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        按目录层级和字符数智能切分文本
														
 
															-        
														
 
															-        逻辑与 PDF 处理完全一致
														
 
															-        """
														
 
															-        toc_pages = toc_info.get("toc_pages", []) or []
														
 
															-        all_toc_items = toc_info.get("toc_items", [])
														
 
															-        
														
 
															-        # 使用完整全文
														
 
															-        full_text = "".join(p.get("text", "") for p in pages_content)
														
 
															-
														
 
															-        print(f"  正在定位{len(classification_items)}个已分类的标题...")
														
 
															-        print(f"  目录所在页: {toc_pages}")
														
 
															-
														
 
															-        # 步骤1: 在正文中定位已分类的标题（跳过目录页）
														
 
															-        located = self._title_matcher.find_title_positions(
														
 
															-            classification_items, full_text, pages_content, toc_pages
														
 
															-        )
														
 
															-        
														
 
															-        # 只保留成功定位的标题
														
 
															-        found_titles = [t for t in located if t["found"]]
														
 
															-        if not found_titles:
														
 
															-            print(f"  错误: 未能在正文中定位任何标题")
														
 
															-            return []
														
 
															-
														
 
															-        print(f"  成功定位 {len(found_titles)}/{len(classification_items)} 个标题")
														
 
															-        
														
 
															-        # 按位置排序
														
 
															-        found_titles.sort(key=lambda x: x["position"])
														
 
															-
														
 
															-        # 步骤2: 构建一级目录标题到分类信息的映射
														
 
															-        chapter_classification_map: Dict[str, Dict[str, Any]] = {}
														
 
															-        for item in classification_items:
														
 
															-            if item.get("level") == 1:
														
 
															-                chapter_title = item.get("title", "")
														
 
															-                chapter_classification_map[chapter_title] = {
														
 
															-                    "category": item.get("category", ""),
														
 
															-                    "category_code": item.get("category_code", "other"),
														
 
															-                    "page": item.get("page", ""),
														
 
															-                    "level": item.get("level", 1),
														
 
															-                }
														
 
															-
														
 
															-        # 步骤3: 为每个找到的标题构建完整的层级路径
														
 
															-        for title_info in found_titles:
														
 
															-            hierarchy_path = self._build_hierarchy_path(
														
 
															-                title_info["title"], all_toc_items, target_level
														
 
															-            )
														
 
															-            title_info["hierarchy_path"] = hierarchy_path
														
 
															-
														
 
															-        # 步骤4: 按目录层级处理每个标题块
														
 
															-        all_chunks: List[Dict[str, Any]] = []
														
 
															-        
														
 
															-        for i, title_info in enumerate(found_titles):
														
 
															-            start_pos = title_info["position"]
														
 
															-            
														
 
															-            # 确定正文块的结束位置（下一个同级标题的位置）
														
 
															-            if i + 1 < len(found_titles):
														
 
															-                end_pos = found_titles[i + 1]["position"]
														
 
															-            else:
														
 
															-                end_pos = len(full_text)
														
 
															-            
														
 
															-            # 提取正文块
														
 
															-            content_block = full_text[start_pos:end_pos]
														
 
															-            
														
 
															-            # 在正文块中查找子标题（按最低层级切分）
														
 
															-            sub_chunks = self._split_by_sub_titles(
														
 
															-                content_block,
														
 
															-                all_toc_items,
														
 
															-                title_info,
														
 
															-                target_level,
														
 
															-                max_chunk_size,
														
 
															-                min_chunk_size,
														
 
															-            )
														
 
															-            
														
 
															-            # 为每个子块添加元数据
														
 
															-            for j, sub_chunk in enumerate(sub_chunks, 1):
														
 
															-                chunk_data = self._build_chunk_metadata(
														
 
															-                    sub_chunk, title_info, start_pos, pages_content, i, j, chapter_classification_map
														
 
															-                )
														
 
															-                all_chunks.append(chunk_data)
														
 
															-
														
 
															-        # 步骤4: 生成最终的chunk_id和serial_number
														
 
															-        final_chunks = self._finalize_chunk_ids(all_chunks)
														
 
															-
														
 
															-        print(f"  初始切分: {len(all_chunks)} 个块")
														
 
															-        print(f"  最终块数: {len(final_chunks)} 个块")
														
 
															-
														
 
															-        return final_chunks
														
 
															-
														
 
															-    def _split_by_sub_titles(
														
 
															-        self,
														
 
															-        content_block: str,
														
 
															-        all_toc_items: List[Dict[str, Any]],
														
 
															-        parent_title_info: Dict[str, Any],
														
 
															-        target_level: int,
														
 
															-        max_chunk_size: int,
														
 
															-        min_chunk_size: int,
														
 
															-    ) -> List[Dict[str, Any]]:
														
 
															-        """
														
 
															-        在正文块中按子标题进行切分（按照toc_items的顺序和层级关系）
														
 
															-        
														
 
															-        核心逻辑：
														
 
															-        1. 查找所有层级的子标题（不限于直接子标题）
														
 
															-        2. 按位置排序后，两个相邻子标题之间的内容作为一个块
														
 
															-        3. 只有当块超过 max_chunk_size 时才按句子切分
														
 
															-        """
														
 
															-        # 找到父标题在toc_items中的位置
														
 
															-        parent_title = parent_title_info["title"]
														
 
															-        parent_idx = -1
														
 
															-        parent_level = target_level
														
 
															-        
														
 
															-        for idx, toc_item in enumerate(all_toc_items):
														
 
															-            if toc_item["title"] == parent_title:
														
 
															-                parent_idx = idx
														
 
															-                parent_level = toc_item.get("level", target_level)
														
 
															-                break
														
 
															-
														
 
															-        if parent_idx < 0:
														
 
															-            # 如果找不到父标题，将整个正文块作为一个块
														
 
															-            if len(content_block) > max_chunk_size:
														
 
															-                return self._split_large_chunk(content_block, max_chunk_size, parent_title, [])
														
 
															-            else:
														
 
															-                return [
														
 
															-                    {
														
 
															-                        "content": content_block,
														
 
															-                        "relative_start": 0,
														
 
															-                        "sub_title": "",
														
 
															-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
														
 
															-                    }
														
 
															-                ]
														
 
															-
														
 
															-        # 找到下一个同级或更高级标题的位置（确定父标题的范围）
														
 
															-        next_sibling_idx = len(all_toc_items)
														
 
															-        for idx in range(parent_idx + 1, len(all_toc_items)):
														
 
															-            item = all_toc_items[idx]
														
 
															-            if item.get("level", 1) <= parent_level:
														
 
															-                next_sibling_idx = idx
														
 
															-                break
														
 
															-
														
 
															-        # 查找所有子标题（所有 level > parent_level 的标题）
														
 
															-        # 这是关键：不限于直接子标题，而是所有更深层级的标题
														
 
															-        all_sub_titles = []
														
 
															-        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
														
 
															-
														
 
															-        for idx in range(parent_idx + 1, next_sibling_idx):
														
 
															-            toc_item = all_toc_items[idx]
														
 
															-            item_level = toc_item.get("level", 1)
														
 
															-            
														
 
															-            # 查找所有更深层级的子标题
														
 
															-            if item_level > parent_level:
														
 
															-                # 在正文块中查找这个子标题
														
 
															-                pos = self._find_title_in_block(
														
 
															-                    toc_item["title"], content_block, fuzzy_threshold
														
 
															-                )
														
 
															-                if pos >= 0:
														
 
															-                    # 调试：显示找到的标题及其周围内容
														
 
															-                    context_start = max(0, pos - 20)
														
 
															-                    context_end = min(len(content_block), pos + len(toc_item["title"]) + 50)
														
 
															-                    context = content_block[context_start:context_end].replace("\n", " ")
														
 
															-                    print(f"        找到子标题: {toc_item['title']} (level={item_level}), 位置={pos}, 上下文: ...{context}...")
														
 
															-                    
														
 
															-                    all_sub_titles.append(
														
 
															-                        {
														
 
															-                            "title": toc_item["title"],
														
 
															-                            "level": toc_item["level"],
														
 
															-                            "position": pos,
														
 
															-                            "toc_index": idx,
														
 
															-                            "toc_item": toc_item,
														
 
															-                        }
														
 
															-                    )
														
 
															-
														
 
															-        # 按位置排序
														
 
															-        all_sub_titles.sort(key=lambda x: x["position"])
														
 
															-
														
 
															-        # 如果没有找到任何子标题，将整个正文块作为一个块
														
 
															-        if not all_sub_titles:
														
 
															-            if len(content_block) > max_chunk_size:
														
 
															-                return self._split_large_chunk(
														
 
															-                    content_block, max_chunk_size, parent_title, 
														
 
															-                    parent_title_info.get("hierarchy_path", [parent_title])
														
 
															-                )
														
 
															-            else:
														
 
															-                return [
														
 
															-                    {
														
 
															-                        "content": content_block,
														
 
															-                        "relative_start": 0,
														
 
															-                        "sub_title": "",
														
 
															-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
														
 
															-                    }
														
 
															-                ]
														
 
															-
														
 
															-        # 找到直接子标题（parent_level + 1）和所有更深层级的标题
														
 
															-        direct_child_level = parent_level + 1
														
 
															-        direct_child_titles = [sub for sub in all_sub_titles if sub["level"] == direct_child_level]
														
 
															-        
														
 
															-        # 找到最低层级（用于判断哪些是最底层的标题）
														
 
															-        max_level = max(sub["level"] for sub in all_sub_titles) if all_sub_titles else parent_level
														
 
															-        
														
 
															-        print(f"      父标题: {parent_title}, 找到 {len(all_sub_titles)} 个子标题, 直接子标题数: {len(direct_child_titles)}, 最低层级: {max_level}")
														
 
															-
														
 
															-        # 如果没有直接子标题，但有更深层级的标题，使用最低层级标题切分（保持向后兼容）
														
 
															-        if not direct_child_titles and all_sub_titles:
														
 
															-            lowest_level_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
														
 
															-            print(f"      没有直接子标题，使用最低层级标题切分: {len(lowest_level_titles)} 个")
														
 
															-            direct_child_titles = lowest_level_titles
														
 
															-
														
 
															-        # 按直接子标题切分（如果存在）
														
 
															-        chunks = []
														
 
															-        if direct_child_titles:
														
 
															-            for i, sub_title in enumerate(direct_child_titles):
														
 
															-                start_pos = sub_title["position"]
														
 
															-
														
 
															-                # 确定结束位置（下一个同级或更高级标题的位置）
														
 
															-                # 在 all_sub_titles 中查找下一个位置大于当前标题，且 level <= direct_child_level 的标题
														
 
															-                end_pos = len(content_block)
														
 
															-                for next_sub in all_sub_titles:
														
 
															-                    if next_sub["position"] > start_pos and next_sub["level"] <= direct_child_level:
														
 
															-                        end_pos = next_sub["position"]
														
 
															-                        break
														
 
															-
														
 
															-                chunk_content = content_block[start_pos:end_pos]
														
 
															-                
														
 
															-                # 调试信息
														
 
															-                content_preview = chunk_content[:100].replace("\n", " ")
														
 
															-                print(f"        切分块 {i+1}: {sub_title['title']} (level={sub_title['level']}), 位置: {start_pos}-{end_pos}, 长度: {len(chunk_content)}, 预览: {content_preview}...")
														
 
															-
														
 
															-                # 检查子标题是否有实际正文内容
														
 
															-                title_len = len(sub_title["title"])
														
 
															-                content_after_title = chunk_content[title_len:].strip()
														
 
															-
														
 
															-                if not content_after_title or len(content_after_title) < 10:
														
 
															-                    print(f"        跳过（内容不足）")
														
 
															-                    continue
														
 
															-
														
 
															-                # 构建层级路径
														
 
															-                hierarchy_path = self._build_hierarchy_path_for_subtitle(
														
 
															-                    sub_title["toc_item"], all_toc_items, parent_title_info
														
 
															-                )
														
 
															-
														
 
															-                # 只有当块超过 max_chunk_size 时才按句子切分
														
 
															-                if len(chunk_content) > max_chunk_size:
														
 
															-                    print(f"        块过大，按句子切分")
														
 
															-                    split_chunks = self._split_large_chunk(
														
 
															-                        chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
														
 
															-                    )
														
 
															-                    for split_chunk in split_chunks:
														
 
															-                        split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
														
 
															-                        split_chunk["sub_title"] = sub_title["title"]
														
 
															-                        if "hierarchy_path" not in split_chunk:
														
 
															-                            split_chunk["hierarchy_path"] = hierarchy_path
														
 
															-                        chunks.append(split_chunk)
														
 
															-                else:
														
 
															-                    # 直接作为一个块
														
 
															-                    chunks.append(
														
 
															-                        {
														
 
															-                            "content": chunk_content,
														
 
															-                            "relative_start": start_pos,
														
 
															-                            "sub_title": sub_title["title"],
														
 
															-                            "hierarchy_path": hierarchy_path,
														
 
															-                        }
														
 
															-                    )
														
 
															-
														
 
															-        # 如果所有子标题都没有正文内容，返回整个正文块
														
 
															-        if not chunks:
														
 
															-            if len(content_block) > max_chunk_size:
														
 
															-                return self._split_large_chunk(
														
 
															-                    content_block, max_chunk_size, parent_title,
														
 
															-                    parent_title_info.get("hierarchy_path", [parent_title])
														
 
															-                )
														
 
															-            else:
														
 
															-                return [
														
 
															-                    {
														
 
															-                        "content": content_block,
														
 
															-                        "relative_start": 0,
														
 
															-                        "sub_title": "",
														
 
															-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
														
 
															-                    }
														
 
															-                ]
														
 
															-
														
 
															-        return chunks
														
 
															-
														
 
															-    def _find_title_in_block(self, title: str, block: str, fuzzy_threshold: float) -> int:
														
 
															-        """在文本块中查找标题位置（简化版）"""
														
 
															-        # 直接使用 TitleMatcher 的方法
														
 
															-        return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
														
 
															-
														
 
															-    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
														
 
															-        """根据位置获取页码"""
														
 
															-        for page in pages_content:
														
 
															-            if page["start_pos"] <= pos < page["end_pos"]:
														
 
															-                return int(page["page_num"])
														
 
															-        return 1
														
--- a/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
@@ -1,123 +0,0 @@
 
															-"""
														
 
															-DOCX 目录提取实现
														
 
															-
														
 
															-参考 docx_toc_detector.py 的逻辑，识别目录行（标题 + 制表符 + 页码）。
														
 
															-"""
														
 
															-
														
 
															-from __future__ import annotations
														
 
															-
														
 
															-import re
														
 
															-from pathlib import Path
														
 
															-from typing import Any, Dict, List
														
 
															-
														
 
															-from docx import Document
														
 
															-
														
 
															-from ..interfaces import TOCExtractor, DocumentSource
														
 
															-from ..utils.toc_level_identifier import TOCLevelIdentifier
														
 
															-from ..utils.toc_pattern_matcher import TOCPatternMatcher
														
 
															-
														
 
															-
														
 
															-class DocxTOCExtractor(TOCExtractor):
														
 
															-    """DOCX 目录提取器"""
														
 
															-
														
 
															-    # 目录行模式：标题 + 制表符 + 页码（页码部分支持带修饰符号，如 ‐ 19 ‐）
														
 
															-    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
														
 
															-
														
 
															-    def __init__(self) -> None:
														
 
															-        """初始化 DOCX 目录提取器"""
														
 
															-        self._level_identifier = TOCLevelIdentifier()
														
 
															-        self._page_extractor = TOCPatternMatcher()
														
 
															-
														
 
															-    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
														
 
															-        """
														
 
															-        提取 DOCX 文档的目录信息
														
 
															-        
														
 
															-        返回结构：
														
 
															-        {
														
 
															-            "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
														
 
															-            "toc_count": int,
														
 
															-            "toc_pages": List[int],
														
 
															-        }
														
 
															-        """
														
 
															-        # 加载文档
														
 
															-        if source.path:
														
 
															-            doc = Document(source.path)
														
 
															-        elif source.content:
														
 
															-            from io import BytesIO
														
 
															-            doc = Document(BytesIO(source.content))
														
 
															-        else:
														
 
															-            raise ValueError("DocumentSource 必须提供 path 或 content")
														
 
															-
														
 
															-        # 提取目录行
														
 
															-        toc_items = []
														
 
															-        toc_pages_set = set()
														
 
															-        
														
 
															-        for para in doc.paragraphs:
														
 
															-            text = para.text.strip()
														
 
															-            if "\t" not in text:
														
 
															-                continue
														
 
															-            
														
 
															-            match = self.TOC_PATTERN.match(text)
														
 
															-            if match:
														
 
															-                title = match.group("title").strip()
														
 
															-                page_raw = match.group("page").strip()
														
 
															-                
														
 
															-                # 从可能带有修饰符号的页码中提取纯数字
														
 
															-                page_num_str = self._page_extractor.extract_page_number(page_raw)
														
 
															-                try:
														
 
															-                    page = int(page_num_str)
														
 
															-                except ValueError:
														
 
															-                    # 如果无法转换为整数，跳过该项
														
 
															-                    continue
														
 
															-                
														
 
															-                # 先不设置层级，后续统一识别
														
 
															-                toc_items.append({
														
 
															-                    "title": title,
														
 
															-                    "page": page,
														
 
															-                    "original": text,
														
 
															-                })
														
 
															-                
														
 
															-                toc_pages_set.add(page)
														
 
															-
														
 
															-        # 估算目录所在页（假设目录在前几页）
														
 
															-        if toc_items:
														
 
															-            # 目录页通常是目录项中最小页码之前的页
														
 
															-            min_content_page = min(item["page"] for item in toc_items)
														
 
															-            toc_pages = list(range(1, min(min_content_page, 10)))
														
 
															-        else:
														
 
															-            toc_pages = []
														
 
															-
														
 
															-        # 使用 TOCLevelIdentifier 识别层级（与 doc_worker 保持一致）
														
 
															-        toc_items = self._level_identifier.identify_levels(toc_items)
														
 
															-
														
 
															-        return {
														
 
															-            "toc_items": toc_items,
														
 
															-            "toc_count": len(toc_items),
														
 
															-            "toc_pages": toc_pages,
														
 
															-        }
														
 
															-
														
 
															-    def _detect_level(self, title: str) -> int:
														
 
															-        """
														
 
															-        根据标题格式检测层级（已废弃，保留仅用于向后兼容）
														
 
															-        
														
 
															-        注意：此方法已不再使用，现在使用 TOCLevelIdentifier 统一识别层级。
														
 
															-        保留此方法仅用于向后兼容和测试。
														
 
															-        """
														
 
															-        # 章节格式
														
 
															-        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
														
 
															-            return 1
														
 
															-        
														
 
															-        # 中文编号 + 右括号
														
 
															-        if re.match(r"^[一二三四五六七八九十]+[）)]", title):
														
 
															-            return 2
														
 
															-        
														
 
															-        # 数字 + 顿号/句号
														
 
															-        if re.match(r"^\d+[、．.]", title):
														
 
															-            return 3
														
 
															-        
														
 
															-        # 括号数字
														
 
															-        if re.match(r"^[\(（]\d+[\)）]", title):
														
 
															-            return 4
														
 
															-        
														
 
															-        # 默认 level 2
														
 
															-        return 2
														
--- a/core/construction_review/component/doc_worker/docx_worker/命令
+++ b/core/construction_review/component/doc_worker/docx_worker/命令
@@ -1 +0,0 @@
 
															-python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
														
--- a/core/construction_review/component/doc_worker/utils/text_split_support.py
+++ b/core/construction_review/component/doc_worker/utils/text_split_support.py
@@ -114,7 +114,7 @@ class HierarchicalChunkMixin:
 
															     """
														
 
															     分级目录切分的通用工具 Mixin。
														
 
															-    把原先 `PdfTextSplitter` / `DocxTextSplitter` 中完全相同的
														
 
															+    把原先 `PdfTextSplitter` 中完全相同的
														
 
															     chunk 元数据构造、层级路径、编号提取等方法抽到这里，
														
 
															     便于多种 worker 复用。
														
 
															     """
														
--- a/core/construction_review/component/doc_worker/命令
+++ b/core/construction_review/component/doc_worker/命令
@@ -1,10 +0,0 @@
 
															-python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															-python -m core.construction_review.component.doc_worker.pdf_worker.cli "E:\LLM\dev_v1\files\7a88f0d5-9d82-43bf-b2b1-c2924d67477e.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															-
														
 
															-
														
 
															-
														
 
															-python -m file_parse.pdf_worker.cli "Z:\施工方案及编制依据案例库（第一阶段）1205\施工方案文档列表\44_四川公路桥梁建设集团有限公司镇巴（川陕界）至广安高速公路通广段C合同段C4项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
														
 
															-
														
 
															-
														
 
															-
														
 
															-python -m doc_worker.pdf_worker.cli "data\44_四川公路桥梁建设集团有限公司镇巴（川陕界）至广安高速公路通广段C合同段C4项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
														
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -5,9 +5,11 @@
 
															 重构说明:
														
 
															 1. 使用类级别共享ChunkClassifier实例，避免重复创建LLM客户端
														
 
															-2. 统一PDF/DOCX处理流程，消除代码重复
														
 
															+2. 统一PDF处理流程，消除代码重复
														
 
															 3. 移除splits冗余数据，统一使用chunks
														
 
															 4. 完善异常处理，记录完整堆栈信息
														
 
															+
														
 
															+注意: DOCX/DOC 文件应在上传层转换为 PDF，本模块不再直接处理 DOCX
														
 
															 """
														
 
															 import io
														
@@ -31,9 +33,6 @@ try:
 
															     from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
														
 
															     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
														
 
															     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
														
 
															-    from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
														
 
															-    from .doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
														
 
															-    from .doc_worker.docx_worker.text_splitter import DocxTextSplitter
														
 
															     from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
														
 
															     from .doc_worker.classification.chunk_classifier import ChunkClassifier
														
 
															     from .doc_worker.config.provider import default_config_provider
														
@@ -43,9 +42,6 @@ except ImportError:
 
															     from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
														
 
															     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
														
 
															     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
														
 
															-    from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
														
 
															-    from core.construction_review.component.doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
														
 
															-    from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
														
 
															     from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
														
 
															     from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
														
 
															     from core.construction_review.component.doc_worker.config.provider import default_config_provider
														
@@ -158,7 +154,7 @@ class DocumentProcessor:
 
															     _shared_chunk_classifier: Optional[ChunkClassifier] = None
														
 
															     def __init__(self, progress_manager=None, callback_task_id: str = None, progress_state: dict = None):
														
 
															-        self.supported_types = ['pdf', 'docx']
														
 
															+        self.supported_types = ['pdf']  # DOCX/DOC 应在上传层转换为 PDF
														
 
															         self.config = default_config_provider
														
 
															         # SSE 进度推送（由 DocumentWorkflow 注入）
														
 
															         self._progress_manager = progress_manager
														
@@ -166,24 +162,54 @@ class DocumentProcessor:
 
															         # 与心跳协程共享的状态字典，更新后心跳自动反映新阶段
														
 
															         self._progress_state = progress_state
														
 
															-        # 初始化各类型文档的处理组件
														
 
															+        # 初始化PDF文档的处理组件
														
 
															         self._components: Dict[str, DocumentComponents] = {
														
 
															             'pdf': DocumentComponents(
														
 
															                 toc_extractor=PdfTOCExtractor(),
														
 
															                 classifier=PdfHierarchyClassifier(),
														
 
															                 fulltext_extractor=HybridFullTextExtractor(),
														
 
															                 text_splitter=PdfTextSplitter()
														
 
															-            ),
														
 
															-            'docx': DocumentComponents(
														
 
															-                toc_extractor=DocxTOCExtractor(),
														
 
															-                classifier=DocxHierarchyClassifier(),
														
 
															-                fulltext_extractor=DocxFullTextExtractor(
														
 
															-                    paragraphs_per_page=int(self.config.get("toc_extraction.paragraphs_per_page", 30))
														
 
															-                ),
														
 
															-                text_splitter=DocxTextSplitter()
														
 
															             )
														
 
															         }
														
 
															+        # 加载标准分类表并创建序号映射
														
 
															+        self._load_category_seq_mappings()
														
 
															+
														
 
															+    def _load_category_seq_mappings(self):
														
 
															+        """加载标准分类表CSV，创建code到seq的映射"""
														
 
															+        self._first_seq_map: Dict[str, int] = {}  # first_code -> first_seq
														
 
															+        self._second_seq_map: Dict[str, int] = {}  # second_code -> second_seq
														
 
															+
														
 
															+        try:
														
 
															+            import csv
														
 
															+            csv_path = Path(__file__).parent / 'doc_worker' / 'config' / 'StandardCategoryTable.csv'
														
 
															+            if not csv_path.exists():
														
 
															+                logger.warning(f"标准分类表不存在: {csv_path}")
														
 
															+                return
														
 
															+
														
 
															+            with open(csv_path, 'r', encoding='utf-8-sig') as f:
														
 
															+                reader = csv.DictReader(f)
														
 
															+                for row in reader:
														
 
															+                    first_code = row.get('first_code', '').strip()
														
 
															+                    second_code = row.get('second_code', '').strip()
														
 
															+                    try:
														
 
															+                        first_seq = int(row.get('first_seq', 0) or 0)
														
 
															+                    except (ValueError, TypeError):
														
 
															+                        first_seq = 0
														
 
															+                    try:
														
 
															+                        second_seq = int(row.get('second_seq', 0) or 0)
														
 
															+                    except (ValueError, TypeError):
														
 
															+                        second_seq = 0
														
 
															+
														
 
															+                    if first_code and first_code not in self._first_seq_map:
														
 
															+                        self._first_seq_map[first_code] = first_seq
														
 
															+                    if second_code and second_code not in self._second_seq_map:
														
 
															+                        self._second_seq_map[second_code] = second_seq
														
 
															+
														
 
															+            logger.debug(f"加载分类序号映射: 一级 {len(self._first_seq_map)} 个, 二级 {len(self._second_seq_map)} 个")
														
 
															+        except Exception as e:
														
 
															+            logger.warning(f"加载分类序号映射失败: {e}")
														
 
															+
														
 
															     @classmethod
														
 
															     def _get_chunk_classifier(cls) -> ChunkClassifier:
														
 
															         """获取共享的ChunkClassifier实例"""
														
@@ -456,10 +482,6 @@ class DocumentProcessor:
 
															             }
														
 
															         }
														
 
															-        # DOCX额外保留full_text字段
														
 
															-        if file_type == 'docx':
														
 
															-            result['full_text'] = ''.join([page.get('text', '') for page in pages_content])
														
 
															-
														
 
															         return result
														
 
															     async def _fallback_processing(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
														
@@ -468,15 +490,12 @@ class DocumentProcessor:
 
															         Args:
														
 
															             file_content: 文件内容
														
 
															-            file_type: 文件类型（pdf/docx）
														
 
															+            file_type: 文件类型（仅支持 pdf）
														
 
															         Returns:
														
 
															             Dict: 基础处理结果
														
 
															         """
														
 
															-        if file_type == 'pdf':
														
 
															-            return await self._fallback_pdf_processing(file_content)
														
 
															-        else:
														
 
															-            return await self._fallback_docx_processing(file_content)
														
 
															+        return await self._fallback_pdf_processing(file_content)
														
 
															     async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
														
 
															         """PDF基础处理模式（当智能处理失败时使用）"""
														
@@ -533,46 +552,6 @@ class DocumentProcessor:
 
															             logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
														
 
															             raise
														
 
															-    async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
														
 
															-        """DOCX基础处理模式（当智能处理失败时使用）"""
														
 
															-        try:
														
 
															-            from docx import Document
														
 
															-            from io import BytesIO
														
 
															-
														
 
															-            logger.info("使用基础DOCX处理模式（内存模式）")
														
 
															-            doc = Document(BytesIO(file_content))
														
 
															-            full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
														
 
															-
														
 
															-            # 简单分块，并过滤空内容
														
 
															-            chunks = []
														
 
															-            chunk_size = 1000
														
 
															-            chunk_index = 1
														
 
															-            for i in range(0, len(full_text), chunk_size):
														
 
															-                chunk_text = full_text[i:i+chunk_size].strip()
														
 
															-                if chunk_text:
														
 
															-                    chunks.append({
														
 
															-                        'chunk_id': f'chunk_{chunk_index}',
														
 
															-                        'content': chunk_text,
														
 
															-                        'metadata': {'chunk_index': chunk_index}
														
 
															-                    })
														
 
															-                    chunk_index += 1
														
 
															-
														
 
															-            logger.info(f"基础处理完成，有效分块数量: {len(chunks)}")
														
 
															-
														
 
															-            return {
														
 
															-                'document_type': 'docx',
														
 
															-                'total_chunks': len(chunks),
														
 
															-                'full_text': full_text,
														
 
															-                'chunks': chunks,
														
 
															-                'metadata': {
														
 
															-                    'paragraphs_count': len(doc.paragraphs),
														
 
															-                    'word_count': len(full_text.split())
														
 
															-                }
														
 
															-            }
														
 
															-        except Exception as e:
														
 
															-            logger.error(f"基础DOCX处理失败: {str(e)}", exc_info=True)
														
 
															-            raise
														
 
															-
														
 
															     def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
														
 
															         """结构化处理，适配doc_worker返回的格式"""
														
 
															         try:
														
@@ -589,6 +568,12 @@ class DocumentProcessor:
 
															                     if content:
														
 
															                         metadata = chunk.get('metadata', {})
														
 
															                         element_tag = metadata.get('element_tag', {})
														
 
															+                        chapter_classification = metadata.get('chapter_classification', '')
														
 
															+                        secondary_category_code = metadata.get('secondary_category_code', '')
														
 
															+
														
 
															+                        # 获取序号
														
 
															+                        first_seq = self._first_seq_map.get(chapter_classification, 0)
														
 
															+                        second_seq = self._second_seq_map.get(secondary_category_code, 0)
														
 
															                         chunks.append({
														
 
															                             'chunk_id': metadata.get('chunk_id', ''),
														
@@ -596,9 +581,11 @@ class DocumentProcessor:
 
															                             'content': content,
														
 
															                             'section_label': metadata.get('section_label', ''),
														
 
															                             'project_plan_type': metadata.get('project_plan_type', ''),
														
 
															-                            'chapter_classification': metadata.get('chapter_classification', ''),
														
 
															+                            'chapter_classification': chapter_classification,
														
 
															+                            'first_seq': first_seq,
														
 
															                             'secondary_category_cn': metadata.get('secondary_category_cn', ''),
														
 
															-                            'secondary_category_code': metadata.get('secondary_category_code', ''),
														
 
															+                            'secondary_category_code': secondary_category_code,
														
 
															+                            'second_seq': second_seq,
														
 
															                             'tertiary_category_cn': metadata.get('tertiary_category_cn', ''),
														
 
															                             'tertiary_category_code': metadata.get('tertiary_category_code', ''),
														
 
															                             # 三级分类详情列表（包含该二级分类下的所有三级分类）
														
@@ -625,17 +612,8 @@ class DocumentProcessor:
 
															                                 'original_content': content[:100] + '...' if len(content) > 100 else content
														
 
															                             })
														
 
															                 else:
														
 
															-                    # DOCX基础处理
														
 
															-                    all_chunks = raw_content.get('chunks', [])
														
 
															+                    # 基础处理结果为空
														
 
															                     chunks = []
														
 
															-                    for chunk in all_chunks:
														
 
															-                        content = chunk.get('content', '').strip()
														
 
															-                        if content:
														
 
															-                            chunks.append({
														
 
															-                                'chunk_id': chunk.get('chunk_id', f'chunk_{len(chunks)+1}'),
														
 
															-                                'content': content,
														
 
															-                                'metadata': chunk.get('metadata', {})
														
 
															-                            })
														
 
															             # 构建返回结果
														
 
															             result = {
														
--- a/core/construction_review/component/reviewers/completeness_reviewer.py
+++ b/core/construction_review/component/reviewers/completeness_reviewer.py
@@ -27,6 +27,9 @@ class TertiaryItem:
 
															     second_cn: str
														
 
															     third_cn: str
														
 
															     third_focus: str
														
 
															+    first_seq: int = 0
														
 
															+    second_seq: int = 0
														
 
															+    third_seq: int = 0
														
 
															 @dataclass
														
@@ -36,6 +39,8 @@ class SecondaryItem:
 
															     second_code: str
														
 
															     first_cn: str
														
 
															     second_cn: str
														
 
															+    first_seq: int = 0
														
 
															+    second_seq: int = 0
														
 
															 @dataclass
														
@@ -100,6 +105,20 @@ class TertiarySpecLoader:
 
															                 third_cn = str(row.get('third_name', '')).strip()
														
 
															                 third_focus = str(row.get('third_focus', '')).strip()
														
 
															+                # 读取序号字段
														
 
															+                try:
														
 
															+                    first_seq = int(row.get('first_seq', 0) or 0)
														
 
															+                except (ValueError, TypeError):
														
 
															+                    first_seq = 0
														
 
															+                try:
														
 
															+                    second_seq = int(row.get('second_seq', 0) or 0)
														
 
															+                except (ValueError, TypeError):
														
 
															+                    second_seq = 0
														
 
															+                try:
														
 
															+                    third_seq = int(row.get('third_seq', 0) or 0)
														
 
															+                except (ValueError, TypeError):
														
 
															+                    third_seq = 0
														
 
															+
														
 
															                 # 动态构建一级分类名称映射
														
 
															                 if first_code and first_cn and first_code not in self.first_names:
														
 
															                     self.first_names[first_code] = first_cn
														
@@ -113,7 +132,10 @@ class TertiarySpecLoader:
 
															                     first_cn=first_cn or self.first_names.get(first_code, first_code),
														
 
															                     second_cn=second_cn,
														
 
															                     third_cn=third_cn,
														
 
															-                    third_focus=third_focus
														
 
															+                    third_focus=third_focus,
														
 
															+                    first_seq=first_seq,
														
 
															+                    second_seq=second_seq,
														
 
															+                    third_seq=third_seq
														
 
															                 )
														
 
															                 # 存储二级项
														
@@ -123,7 +145,9 @@ class TertiarySpecLoader:
 
															                         first_code=first_code,
														
 
															                         second_code=second_code,
														
 
															                         first_cn=first_cn or self.first_names.get(first_code, first_code),
														
 
															-                        second_cn=second_cn
														
 
															+                        second_cn=second_cn,
														
 
															+                        first_seq=first_seq,
														
 
															+                        second_seq=second_seq
														
 
															                     )
														
 
															         except Exception as e:
														
@@ -383,13 +407,19 @@ class LightweightCompletenessChecker:
 
															         extra_second = actual_second_keys - required_second
														
 
															         # 一级缺失详情
														
 
															-        missing_first_details = [
														
 
															-            {
														
 
															+        missing_first_details = []
														
 
															+        for c in sorted(missing_first):
														
 
															+            # 从任意该一级下的二级获取 first_seq
														
 
															+            first_seq = 0
														
 
															+            for (fc, sc), item in self.secondary_specs.items():
														
 
															+                if fc == c:
														
 
															+                    first_seq = item.first_seq
														
 
															+                    break
														
 
															+            missing_first_details.append({
														
 
															                 "first_code": c,
														
 
															-                "first_name": self.spec_loader.first_names.get(c, c)
														
 
															-            }
														
 
															-            for c in sorted(missing_first)
														
 
															-        ]
														
 
															+                "first_name": self.spec_loader.first_names.get(c, c),
														
 
															+                "first_seq": first_seq
														
 
															+            })
														
 
															         # 二级缺失详情
														
 
															         missing_second_details = []
														
@@ -398,8 +428,10 @@ class LightweightCompletenessChecker:
 
															             missing_second_details.append({
														
 
															                 "first_code": cat1,
														
 
															                 "first_name": item.first_cn if item else self.spec_loader.first_names.get(cat1, cat1),
														
 
															+                "first_seq": item.first_seq if item else 0,
														
 
															                 "secondary_code": cat2,
														
 
															-                "secondary_name": item.second_cn if item else "未知"
														
 
															+                "secondary_name": item.second_cn if item else "未知",
														
 
															+                "second_seq": item.second_seq if item else 0
														
 
															             })
														
 
															         # 二级多余详情（目录有但标准无）
														
@@ -409,8 +441,10 @@ class LightweightCompletenessChecker:
 
															             extra_second_details.append({
														
 
															                 "first_code": cat1,
														
 
															                 "first_name": self.spec_loader.first_names.get(cat1, cat1),
														
 
															+                "first_seq": item.first_seq if item else 0,
														
 
															                 "secondary_code": cat2,
														
 
															                 "secondary_name": item.second_cn if item else "未知",
														
 
															+                "second_seq": item.second_seq if item else 0,
														
 
															                 "outline_title": outline_secondary.get((cat1, cat2), "")
														
 
															             })
														
@@ -480,10 +514,13 @@ class LightweightCompletenessChecker:
 
															                 missing_details.append({
														
 
															                     "first_code": cat1,
														
 
															                     "first_name": item.first_cn,
														
 
															+                    "first_seq": item.first_seq,
														
 
															                     "secondary_code": cat2,
														
 
															                     "secondary_name": item.second_cn,
														
 
															+                    "second_seq": item.second_seq,
														
 
															                     "tertiary_code": cat3,
														
 
															                     "tertiary_name": item.third_cn,
														
 
															+                    "third_seq": item.third_seq,
														
 
															                     "focus": item.third_focus
														
 
															                 })
														
@@ -508,8 +545,10 @@ class LightweightCompletenessChecker:
 
															             secondary_stats_list.append({
														
 
															                 "first_code": cat1,
														
 
															                 "first_name": item.first_cn if item else self.spec_loader.first_names.get(cat1, cat1),
														
 
															+                "first_seq": item.first_seq if item else 0,
														
 
															                 "secondary_code": cat2,
														
 
															                 "secondary_name": item.second_cn if item else "未知",
														
 
															+                "second_seq": item.second_seq if item else 0,
														
 
															                 "total_tertiary": stats["total"],
														
 
															                 "present": stats["present"],
														
 
															                 "missing": stats["missing"],
														
@@ -631,6 +670,12 @@ class LightweightCompletenessChecker:
 
															         for first_code in sorted(required_first):
														
 
															             first_name = self.spec_loader.first_names.get(first_code, first_code)
														
 
															+            # 获取一级序号
														
 
															+            first_seq = 0
														
 
															+            for (fc, sc), item in self.secondary_specs.items():
														
 
															+                if fc == first_code:
														
 
															+                    first_seq = item.first_seq
														
 
															+                    break
														
 
															             # ── 一级缺失 ──────────────────────────────────────────────
														
 
															             if first_code not in actual_first:
														
@@ -643,6 +688,7 @@ class LightweightCompletenessChecker:
 
															                         f"根据规范要求，文档必须包含'{first_name}'一级章节，"
														
 
															                         f"当前正文中未发现该章节任何内容"
														
 
															                     ),
														
 
															+                    "first_seq": first_seq,
														
 
															                 })
														
 
															                 continue
														
@@ -653,6 +699,7 @@ class LightweightCompletenessChecker:
 
															             for (cat1, cat2) in required_second:
														
 
															                 sec_item = self.secondary_specs.get((cat1, cat2))
														
 
															                 second_name = sec_item.second_cn if sec_item else cat2
														
 
															+                second_seq = sec_item.second_seq if sec_item else 0
														
 
															                 # ── 二级缺失 ──────────────────────────────────────────
														
 
															                 if (cat1, cat2) not in actual_secondary:
														
@@ -667,6 +714,8 @@ class LightweightCompletenessChecker:
 
															                             f"根据规范要求，'{first_name}'下应包含'{second_name}'二级章节，"
														
 
															                             f"当前正文中未发现该章节内容"
														
 
															                         ),
														
 
															+                        "first_seq": first_seq,
														
 
															+                        "second_seq": second_seq,
														
 
															                     })
														
 
															                     continue
														
@@ -685,29 +734,20 @@ class LightweightCompletenessChecker:
 
															                 if not missing_t_items:
														
 
															                     continue
														
 
															-                n = len(missing_t_items)
														
 
															-
														
 
															-                # 缺失名称列表（最多展示 5 条）
														
 
															-                missing_labels = [
														
 
															-                    f"{i + 1}.{t.third_cn}" for i, t in enumerate(missing_t_items[:5])
														
 
															-                ]
														
 
															-                if n > 5:
														
 
															-                    missing_labels.append(f"等共{n}项")
														
 
															-                missing_str = "、".join(missing_labels)
														
 
															-
														
 
															-                recommendations.append({
														
 
															-                    "level": "三级",
														
 
															-                    "issue_point": (
														
 
															-                        f"【三级内容缺失】{first_name} > {second_name} 缺少{n}个三级要点：{missing_str}"
														
 
															-                    ),
														
 
															-                    "location": f"{first_name} > {second_name}",
														
 
															-                    "suggestion": (
														
 
															-                        f"请补充'{second_name}'以下{n}个要点内容：{missing_str}"
														
 
															-                    ),
														
 
															-                    "reason": (
														
 
															-                        f"'{second_name}'下缺失以下{n}个规范要求的内容要点：{missing_str}"
														
 
															-                    ),
														
 
															-                })
														
 
															+                # 为每个缺失的三级项创建单独的 recommendation
														
 
															+                for t_item in missing_t_items:
														
 
															+                    recommendations.append({
														
 
															+                        "level": "三级",
														
 
															+                        "issue_point": (
														
 
															+                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
														
 
															+                        ),
														
 
															+                        "location": f"{first_name} > {second_name}",
														
 
															+                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
														
 
															+                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
														
 
															+                        "first_seq": first_seq,
														
 
															+                        "second_seq": second_seq,
														
 
															+                        "third_seq": t_item.third_seq,
														
 
															+                    })
														
 
															         # ── 一致性审查：目录有列但正文无内容 ─────────────────────────────
														
 
															         if outline_result:
														
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/category_loaders.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/category_loaders.py
@@ -102,11 +102,14 @@ class CategoryStandardLoader:
 
															                 self.standards.append(CategoryStandard(
														
 
															                     first_code=row.get('first_code', ''),
														
 
															                     first_name=row.get('first_name', ''),
														
 
															+                    first_seq=int(row.get('first_seq', '0') or 0),
														
 
															                     second_code=row.get('second_code', ''),
														
 
															                     second_name=row.get('second_name', ''),
														
 
															+                    second_seq=int(row.get('second_seq', '0') or 0),
														
 
															                     second_focus=row.get('second_focus', ''),
														
 
															                     third_code=row.get('third_code', ''),
														
 
															                     third_name=row.get('third_name', ''),
														
 
															+                    third_seq=int(row.get('third_seq', '0') or 0),
														
 
															                     third_focus=row.get('third_focus', ''),
														
 
															                     keywords=row.get('keywords', '')
														
 
															                 ))
														
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py
@@ -219,6 +219,7 @@ class ContentClassifierClient:
 
															             default_contents.append(ClassifiedContent(
														
 
															                 third_category_name=std.third_name,
														
 
															                 third_category_code=std.third_code,
														
 
															+                third_seq=std.third_seq,
														
 
															                 start_line=start_line,
														
 
															                 end_line=end_line,
														
 
															                 content=content
														
@@ -466,11 +467,11 @@ class ContentClassifierClient:
 
															             # 支持两种键名: classified_contents 或 classified_contents_list
														
 
															             items = data.get("classified_contents", []) or data.get("classified_contents_list", [])
														
 
															-            # 构建索引映射表：索引 -> (third_name, third_code)
														
 
															-            index_mapping = {0: ("非标准项", "no_standard")}
														
 
															+            # 构建索引映射表：索引 -> (third_name, third_code, third_seq)
														
 
															+            index_mapping = {0: ("非标准项", "no_standard", 0)}
														
 
															             if section.category_standards:
														
 
															                 for i, std in enumerate(section.category_standards, 1):
														
 
															-                    index_mapping[i] = (std.third_name, std.third_code)
														
 
															+                    index_mapping[i] = (std.third_name, std.third_code, std.third_seq)
														
 
															             for item in items:
														
 
															                 start_line = item.get("start_line", 0)
														
@@ -479,9 +480,9 @@ class ContentClassifierClient:
 
															                 # 优先使用 category_index 进行映射
														
 
															                 category_index = item.get("category_index")
														
 
															                 if category_index is not None:
														
 
															-                    # 通过索引映射获取标准名称和代码
														
 
															+                    # 通过索引映射获取标准名称、代码和序号
														
 
															                     idx = int(category_index) if isinstance(category_index, (int, float, str)) else 0
														
 
															-                    category_name, category_code = index_mapping.get(idx, ("非标准项", "no_standard"))
														
 
															+                    category_name, category_code, category_seq = index_mapping.get(idx, ("非标准项", "no_standard", 0))
														
 
															                 else:
														
 
															                     # 兼容旧格式：直接读取 third_category_code 和 third_category_name
														
 
															                     category_code = item.get("third_category_code", "")
														
@@ -503,6 +504,7 @@ class ContentClassifierClient:
 
															                 contents.append(ClassifiedContent(
														
 
															                     third_category_name=category_name,
														
 
															                     third_category_code=category_code,
														
 
															+                    third_seq=category_seq,
														
 
															                     start_line=start_line,
														
 
															                     end_line=end_line,
														
 
															                     content=content
														
@@ -552,6 +554,7 @@ class ContentClassifierClient:
 
															                     contents.append(ClassifiedContent(
														
 
															                         third_category_name=category_name,
														
 
															                         third_category_code=category_code,
														
 
															+                        third_seq=0,
														
 
															                         start_line=start_line,
														
 
															                         end_line=end_line,
														
 
															                         content=content
														
@@ -614,6 +617,7 @@ class ContentClassifierClient:
 
															                 merged_contents.append(ClassifiedContent(
														
 
															                     third_category_name=group_contents[0].third_category_name,
														
 
															                     third_category_code=category_code,
														
 
															+                    third_seq=group_contents[0].third_seq,
														
 
															                     start_line=range_info['start'],
														
 
															                     end_line=range_info['end'],
														
 
															                     content=merged_content
														
@@ -771,6 +775,7 @@ class ContentClassifierClient:
 
															                 supplemented.append(ClassifiedContent(
														
 
															                     third_category_name=std.third_name,
														
 
															                     third_category_code=std.third_code,
														
 
															+                    third_seq=std.third_seq,
														
 
															                     start_line=start,
														
 
															                     end_line=end,
														
 
															                     content=content
														
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py
@@ -166,6 +166,7 @@ class LLMContentClassifier:
 
															                     {
														
 
															                         "third_category_name": c.third_category_name,
														
 
															                         "third_category_code": c.third_category_code,
														
 
															+                        "third_seq": c.third_seq,
														
 
															                         "start_line": c.start_line,
														
 
															                         "end_line": c.end_line,
														
 
															                         "content": c.content
														
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/models.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/models.py
@@ -13,11 +13,14 @@ class CategoryStandard:
 
															     """标准分类定义"""
														
 
															     first_code: str
														
 
															     first_name: str
														
 
															+    first_seq: int  # 一级序号
														
 
															     second_code: str
														
 
															     second_name: str
														
 
															+    second_seq: int  # 二级序号
														
 
															     second_focus: str  # 二级分类关注点
														
 
															     third_code: str
														
 
															     third_name: str
														
 
															+    third_seq: int  # 三级序号
														
 
															     third_focus: str
														
 
															     keywords: str = ""
														
@@ -35,6 +38,7 @@ class ClassifiedContent:
 
															     """分类结果"""
														
 
															     third_category_name: str  # 三级分类名称
														
 
															     third_category_code: str  # 三级分类代码
														
 
															+    third_seq: int  # 三级序号
														
 
															     start_line: int
														
 
															     end_line: int
														
 
															     content: str  # 原文内容
														
--- a/requirements.txt
+++ b/requirements.txt
--- a/views/construction_review/file_upload.py
+++ b/views/construction_review/file_upload.py
@@ -6,7 +6,11 @@ import ast
 
															 import traceback
														
 
															 import uuid
														
 
															 import time
														
 
															+import tempfile
														
 
															+import subprocess
														
 
															+import os
														
 
															 from datetime import datetime
														
 
															+from pathlib import Path
														
 
															 from pydantic import BaseModel, Field
														
 
															 from typing import Optional,List
														
@@ -20,6 +24,156 @@ from core.base.redis_duplicate_checker import RedisDuplicateChecker
 
															 from foundation.infrastructure.tracing import TraceContext, auto_trace
														
 
															+def _find_soffice_path() -> str:
														
 
															+    """
														
 
															+    查找 LibreOffice soffice 可执行文件路径
														
 
															+
														
 
															+    Returns:
														
 
															+        str: soffice 可执行文件路径
														
 
															+
														
 
															+    Raises:
														
 
															+        FileNotFoundError: 未找到 LibreOffice
														
 
															+    """
														
 
															+    import platform
														
 
															+
														
 
															+    # Linux/Docker 环境：直接使用 soffice
														
 
															+    if platform.system() != 'Windows':
														
 
															+        return 'soffice'
														
 
															+
														
 
															+    # Windows 环境：检测常见安装路径
														
 
															+    possible_paths = [
														
 
															+        r"C:\Program Files\LibreOffice\program\soffice.exe",
														
 
															+        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
														
 
															+    ]
														
 
															+
														
 
															+    for path in possible_paths:
														
 
															+        if os.path.exists(path):
														
 
															+            logger.info(f"找到 LibreOffice: {path}")
														
 
															+            return path
														
 
															+
														
 
															+    raise FileNotFoundError(
														
 
															+        "LibreOffice 未安装。请从 https://www.libreoffice.org/download/ 下载安装，"
														
 
															+        "或确保 soffice.exe 在 PATH 中"
														
 
															+    )
														
 
															+
														
 
															+
														
 
															+def convert_docx_to_pdf(docx_content: bytes, filename: str) -> tuple[bytes, str]:
														
 
															+    """
														
 
															+    将 docx/doc 文件内容转换为 PDF
														
 
															+
														
 
															+    Windows 开发环境: 优先使用 docx2pdf (Microsoft Word COM)，回退到 LibreOffice
														
 
															+    Linux/Docker 生产环境: 使用 LibreOffice (soffice)
														
 
															+
														
 
															+    Args:
														
 
															+        docx_content: docx/doc 文件的二进制内容
														
 
															+        filename: 原始文件名（用于生成新的 PDF 文件名）
														
 
															+
														
 
															+    Returns:
														
 
															+        tuple[bytes, str]: (PDF 文件内容, 新的 PDF 文件名)
														
 
															+
														
 
															+    Raises:
														
 
															+        Exception: 转换失败时抛出异常
														
 
															+    """
														
 
															+    import platform
														
 
															+
														
 
															+    # Windows 环境：优先尝试 docx2pdf (Microsoft Word COM)
														
 
															+    if platform.system() == 'Windows':
														
 
															+        try:
														
 
															+            from docx2pdf import convert
														
 
															+            return _convert_via_docx2pdf(docx_content, filename, convert)
														
 
															+        except ImportError:
														
 
															+            logger.info("docx2pdf 未安装，使用 LibreOffice")
														
 
															+        except Exception as e:
														
 
															+            logger.warning(f"docx2pdf 转换失败，回退到 LibreOffice: {str(e)}")
														
 
															+
														
 
															+    # Linux/Docker 或 Windows 回退：使用 LibreOffice
														
 
															+    return _convert_via_libreoffice(docx_content, filename)
														
 
															+
														
 
															+
														
 
															+def _convert_via_docx2pdf(docx_content: bytes, filename: str, convert_func) -> tuple[bytes, str]:
														
 
															+    """使用 docx2pdf (Microsoft Word COM) 转换"""
														
 
															+    with tempfile.TemporaryDirectory() as temp_dir:
														
 
															+        temp_dir_path = Path(temp_dir)
														
 
															+
														
 
															+        # 保存原始文件
														
 
															+        original_ext = Path(filename).suffix.lower()
														
 
															+        base_name = Path(filename).stem
														
 
															+        temp_input = temp_dir_path / f"input{original_ext}"
														
 
															+        temp_output = temp_dir_path / "output.pdf"
														
 
															+        temp_input.write_bytes(docx_content)
														
 
															+
														
 
															+        logger.info(f"使用 Microsoft Word 转换 {filename} 为 PDF...")
														
 
															+
														
 
															+        convert_func(str(temp_input), str(temp_output))
														
 
															+
														
 
															+        if not temp_output.exists():
														
 
															+            raise Exception("转换后未找到 PDF 文件")
														
 
															+
														
 
															+        pdf_content = temp_output.read_bytes()
														
 
															+        pdf_filename = f"{base_name}.pdf"
														
 
															+
														
 
															+        logger.info(f"成功转换 {filename} -> {pdf_filename}, PDF 大小: {len(pdf_content) / 1024:.2f} KB")
														
 
															+
														
 
															+        return pdf_content, pdf_filename
														
 
															+
														
 
															+
														
 
															+def _convert_via_libreoffice(docx_content: bytes, filename: str) -> tuple[bytes, str]:
														
 
															+    """使用 LibreOffice (soffice) 转换"""
														
 
															+    # 创建临时目录
														
 
															+    with tempfile.TemporaryDirectory() as temp_dir:
														
 
															+        temp_dir_path = Path(temp_dir)
														
 
															+
														
 
															+        # 保存原始文件到临时目录
														
 
															+        original_ext = Path(filename).suffix.lower()
														
 
															+        base_name = Path(filename).stem
														
 
															+        temp_input = temp_dir_path / f"input{original_ext}"
														
 
															+        temp_input.write_bytes(docx_content)
														
 
															+
														
 
															+        logger.info(f"使用 LibreOffice 转换 {filename} 为 PDF...")
														
 
															+
														
 
															+        # 查找 LibreOffice 路径
														
 
															+        try:
														
 
															+            soffice_path = _find_soffice_path()
														
 
															+        except FileNotFoundError as e:
														
 
															+            logger.error(str(e))
														
 
															+            raise Exception(str(e))
														
 
															+
														
 
															+        # 使用 LibreOffice 转换
														
 
															+        try:
														
 
															+            result = subprocess.run(
														
 
															+                [
														
 
															+                    soffice_path, '--headless', '--convert-to', 'pdf',
														
 
															+                    '--outdir', str(temp_dir_path),
														
 
															+                    str(temp_input)
														
 
															+                ],
														
 
															+                capture_output=True,
														
 
															+                text=True,
														
 
															+                timeout=120  # 2分钟超时
														
 
															+            )
														
 
															+
														
 
															+            if result.returncode != 0:
														
 
															+                logger.error(f"LibreOffice 转换失败: {result.stderr}")
														
 
															+                raise Exception(f"LibreOffice 转换失败: {result.stderr}")
														
 
															+
														
 
															+            # 查找生成的 PDF 文件
														
 
															+            pdf_files = list(temp_dir_path.glob("*.pdf"))
														
 
															+            if not pdf_files:
														
 
															+                raise Exception("转换后未找到 PDF 文件")
														
 
															+
														
 
															+            pdf_file = pdf_files[0]
														
 
															+            pdf_content = pdf_file.read_bytes()
														
 
															+            pdf_filename = f"{base_name}.pdf"
														
 
															+
														
 
															+            logger.info(f"成功转换 {filename} -> {pdf_filename}, PDF 大小: {len(pdf_content) / 1024:.2f} KB")
														
 
															+
														
 
															+            return pdf_content, pdf_filename
														
 
															+
														
 
															+        except subprocess.TimeoutExpired:
														
 
															+            raise Exception("LibreOffice 转换超时")
														
 
															+        except FileNotFoundError:
														
 
															+            raise Exception("LibreOffice 未安装或 soffice 命令不可用")
														
 
															+
														
 
															+
														
 
															 file_upload_router = APIRouter(prefix="/sgsc", tags=["前端接口"])
														
 
															 uploaded_files = {}
														
 
															 # 初始化工作流管理器
														
@@ -153,10 +307,27 @@ async def file_upload(
 
															         # 确定文件类型
														
 
															         file_extension = file[0].filename.split('.')[-1].lower() if '.' in file[0].filename else ''
														
 
															+        original_filename = file[0].filename  # 保存原始文件名
														
 
															+
														
 
															         if content.startswith(b'%PDF'):
														
 
															             file_type = 'pdf'
														
 
															         elif content.startswith(b'PK\x03\x04') and file_extension in ['docx', 'doc']:
														
 
															-            file_type = 'docx'
														
 
															+            # 检测到 docx/doc 文件，转换为 PDF
														
 
															+            logger.info(f"检测到 {file_extension} 文件，正在转换为 PDF...")
														
 
															+            try:
														
 
															+                pdf_content, pdf_filename = convert_docx_to_pdf(content, original_filename)
														
 
															+                # 更新文件内容和相关信息
														
 
															+                content = pdf_content
														
 
															+                original_filename = pdf_filename
														
 
															+                file_type = 'pdf'  # 标记为 PDF 类型，后续流程按 PDF 处理
														
 
															+                file_size = len(pdf_content)
														
 
															+                file_size_mb = round(file_size / (1024 * 1024), 2)
														
 
															+                # 重新生成 MD5（基于转换后的 PDF）
														
 
															+                file_id = md5.md5_id(content)
														
 
															+                logger.info(f"文件已转换为 PDF: {pdf_filename}, 大小: {file_size_mb} MB")
														
 
															+            except Exception as convert_error:
														
 
															+                logger.error(f"docx 转 PDF 失败: {str(convert_error)}")
														
 
															+                raise FileUploadErrors.internal_error(f"文档转换失败: {str(convert_error)}")
														
 
															         else:
														
 
															             file_type = 'unknown'
														
@@ -172,7 +343,7 @@ async def file_upload(
 
															                 'user_id': user,
														
 
															                 'file_type': file_type,
														
 
															                 'callback_task_id': callback_task_id,
														
 
															-                "file_name": file[0].filename,
														
 
															+                "file_name": original_filename,  # 使用转换后的文件名（docx 转 PDF 后会更新）
														
 
															                 "file_size": file_size_mb,
														
 
															                 'updated_at': created_at
														
 
															             }