před 2 týdny · 28eb485205
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 
				 FROM python:3.12-slim
			
 
				 
			
 
				-# 安装 OpenCV 系统依赖（更完整的列表）
			
 
				+# 安装 OpenCV 系统依赖及 LibreOffice（docx/doc 转 PDF）
			
 
				 RUN apt-get update && apt-get install -y \
			
 
				     # OpenCV 核心依赖
			
 
				     libgl1 \
			
@@ -26,6 +26,12 @@ RUN apt-get update && apt-get install -y \
 
				     # 其他可能需要的库
			
 
				     libfontconfig1 \
			
 
				     libfreetype6 \
			
 
				+    # LibreOffice（用于 docx/doc 转 PDF）
			
 
				+    libreoffice-writer \
			
 
				+    libreoffice-core \
			
 
				+    # 中文字体（PDF 转换中文支持）
			
 
				+    fonts-wqy-zenhei \
			
 
				+    --no-install-recommends \
			
 
				     && rm -rf /var/lib/apt/lists/*
			
 
				 
			
 
				 ENV DEBIAN_FRONTEND=noninteractive \
			
--- a/core/base/__init__.py
+++ b/core/base/__init__.py
@@ -1,27 +1,27 @@
 
				 """
			
 
				 文档分类切分库
			
 
				-支持PDF和Word文档的目录提取、智能分类和文本切分
			
 
				+支持PDF文档的目录提取、智能分类和文本切分
			
 
				 
			
 
				 主要功能：
			
 
				-1. 提取PDF/Word文档的目录结构
			
 
				+1. 提取PDF文档的目录结构
			
 
				 2. 识别和校验目录的层级关系
			
 
				 3. 基于二级目录关键词匹配对一级目录进行智能分类
			
 
				 4. 按目录层级和字符数智能切分文本
			
 
				 5. 保存分类结果到多种格式
			
 
				 
			
 
				 使用示例（当前推荐直接使用业务层封装的 DocumentProcessor，而不是底层分类器类）。
			
 
				+
			
 
				+注意: DOCX/DOC 文件应在上传层转换为 PDF，本模块不再直接处理 DOCX
			
 
				 """
			
 
				 
			
 
				-__version__ = "2.0.0"
			
 
				+__version__ = "2.1.0"
			
 
				 __author__ = "Your Name"
			
 
				 
			
 
				 
			
 
				 from core.construction_review.component.doc_worker.interfaces import TOCExtractor, TextSplitter
			
 
				 from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier
			
 
				 from core.construction_review.component.doc_worker.pdf_worker.toc_extractor import PdfTOCExtractor
			
 
				-from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
			
 
				 from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
			
 
				-from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
			
 
				 
			
 
				 
			
 
				 __all__ = [
			
@@ -29,8 +29,6 @@ __all__ = [
 
				     'TextSplitter',
			
 
				     'HierarchyClassifier',
			
 
				     'PdfTOCExtractor',
			
 
				-    'DocxTOCExtractor',
			
 
				     'PdfTextSplitter',
			
 
				-    'DocxTextSplitter',
			
 
				 ]
			
 
				 
			
--- a/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
+++ b/core/construction_review/component/doc_worker/config/StandardCategoryTable.csv
@@ -1,4 +1,4 @@
 
				-first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus,keywords
			
 
				+first_code,first_name,second_code,second_name,second_focus,third_code,third_name,third_focus,keywords
			
 
				 basis,编制依据,LawsAndRegulations,法律法规,NULL,NationalLawsAndRegulations,国家政府发布的法律法规与规章制度,国家级、法律、法规、规章、强制力、普遍适用、基础框架、顶层设计、行业准则、合规性、统一标准、权威性、强制性条文、基本要求。,国家法律;法规;规章;强制性条文;国务院令;住房城乡建设部;中华人民共和国
			
 
				 basis,编制依据,LawsAndRegulations,法律法规,NULL,ProvincialLawsAndRegulationsOfProjectLocation,工程所在地省级政府发布的法律法规与规章制度,地方性、区域性、细化补充、因地制宜、执行细则、地方特色、适应性要求、属地管理、动态调整、配套政策、本地化实施。,省级;地方法规;省政府;地方规章;属地管理;四川省;省人民政府
			
 
				 basis,编制依据,StandardsAndSpecifications,标准规范,NULL,IndustryStandards,行业标准,需符合国家/行业强制或推荐性标准（如GB/T、JTG等）、时效性强（需跟踪最新版）、覆盖全生命周期（设计→施工→运维）、是定义工程项目的最低技术要求、质量验收准则、安全红线。,GB/T;JTG;CJJ;行业标准;国家标准;推荐性标准;GB 5;TB;HJ;DL
			
@@ -56,7 +56,6 @@ technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规
 
				 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,TemporaryWaterAndElectricityConsumption,临时水电用量,需计算施工期间的用水、用电量（如“临时用水管径DN100”“临时用电容量500kW”）、用于临时设施的设计；,临时用水量;临时用电量;用水量;用电量;水电用量
			
 
				 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,TheSiteIsFlat,场地平整,需明确平整的范围、标高（如“平整场地至设计标高±0.000”“压实度达到90%”）、是施工场地准备的基础；,场地平整;整平场地;标高;压实度;平整
			
 
				 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,Staffing,人员配置,需列出各岗位的人员数量（如“项目经理1名”“施工员2名”“钢筋工10名”）、是劳动力管理的核心；,人员配置;岗位人员;项目经理;施工员;人员配备;人员分工
			
 
				-technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,EquipmentEntry,设备进场,需明确设备的进场时间、运输方式（如“塔式起重机进场时间2026年3月1日”“采用平板车运输”）、是设备准备的关键；,设备进场;进场时间;进场方式;进场日期;机械进场
			
 
				 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,SafetyProtectionFacilities,安全防护措施,需列出现场的安全设施（如“安全网”“防护栏杆”“消防栓”）、是安全保障的基础；,安全防护;安全网;防护栏杆;消防设施;安全设施;防护措施
			
 
				 technology,施工工艺技术,PrepWork,施工准备,名称类、数值类、规格类、数值单位类、岗位名称类、时间日期类、工程设备类,PersonnelAccess,人员上下通道,需明确通道的形式、位置（如“楼梯间通道”“脚手架斜道”）、是人员通行的安全保障。,人员通道;上下通道;楼梯通道;斜道;人员上下;通道布置
			
 
				 technology,施工工艺技术,Process,工艺流程,工序专业名称类、工程名称类、数值类、数值单位类,ConstructionProcess,施工工序,需列出工程的主要工序（如“地基处理→基础浇筑→主体结构→装饰装修”）、是工艺流程的核心；,施工工序;主要工序;工序流程;施工顺序;工艺步骤
			
@@ -101,10 +100,10 @@ quality,质量保证措施,Excellence,工程创优规划,工程创优总体计
 
				 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,NewTechnologyPromotion,新技术推广,需应用“四新技术”（新技术、新材料、新工艺、新设备）、提升创优的技术含量；,四新技术;新技术推广;新工艺;新材料;新设备;技术创新应用
			
 
				 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,PreparationOfApplicationMaterials,申报资料编制,需整理创优所需的资料（如工程质量报告、技术创新成果）、是创优申报的核心材料；,申报资料;创优申报;工程质量报告;申报材料
			
 
				 quality,质量保证措施,Excellence,工程创优规划,工程创优总体计划、技术准备（BIM/新技术应用）、过程控制（关键工序精品打造）、细部处理（节点优化）、精品工程创建、新技术推广（四新技术）、申报资料编制、工程资料归档、创优考核机制,EngineeringDataArchiving,工程资料归档,需确保资料真实、完整、符合创优评审要求。,工程资料归档;档案管理;竣工资料;资料归档
			
 
				-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,RawMaterialInspection,原材料进场检验,需执行“三证一检”（合格证、质检报告、生产许可证+进场复检）、确保材料质量；,原材料进场;三证一检;材料检验;复检报告;进场材料质量
			
 
				-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PhysicalProjectQualityAcceptance,实体工程质量验收,需按分项（如“钢筋绑扎”）、分部工程（如“基础工程”）进行验收、符合规范要求；,实体验收;分项验收;分部验收;实体工程验收;工程质量验收
			
 
				-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PreventionAndControlOfCommonQualityDefectsInProcesses,工序质量通病防治,需针对常见问题（如“墙面空鼓”“屋面渗漏”）制定专项措施（如“抹灰前基层凿毛”“防水附加层施工”）、减少质量缺陷；,质量通病;空鼓;渗漏;裂缝;蜂窝麻面;防治措施;通病防治
			
 
				-quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料进场检验（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,SeasonalConstructionQualityAssuranceMeasures,季节性施工质量保证措施,需针对冬期（混凝土保温）、雨期（防水加强）、高温（混凝土保湿）制定专项措施、确保施工质量；,季节性施工;冬期施工;雨期施工;高温施工;夏季施工;冬季混凝土
			
 
				+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,RawMaterialInspection,原材料检查验收,需执行“三证一检”（合格证、质检报告、生产许可证+进场复检）、确保材料质量；,原材料进场;三证一检;材料检验;复检报告;进场材料质量
			
 
				+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PhysicalProjectQualityAcceptance,实体工程质量验收,需按分项（如“钢筋绑扎”）、分部工程（如“基础工程”）进行验收、符合规范要求；,实体验收;分项验收;分部验收;实体工程验收;工程质量验收
			
 
				+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,PreventionAndControlOfCommonQualityDefectsInProcesses,工序质量通病防治,需针对常见问题（如“墙面空鼓”“屋面渗漏”）制定专项措施（如“抹灰前基层凿毛”“防水附加层施工”）、减少质量缺陷；,质量通病;空鼓;渗漏;裂缝;蜂窝麻面;防治措施;通病防治
			
 
				+quality,质量保证措施,QualityControl,质量控制程序与具体措施,原材料检查验收（三证一检）、实体工程质量验收（分项/分部工程验收）、质量通病防治（墙面空鼓/屋面渗漏）、季节性施工质量控制（冬期混凝土保温/雨期防水）、工序质量控制点、质量检查程序（自检/互检/专检）、质量问题整改（闭环管理）,SeasonalConstructionQualityAssuranceMeasures,季节性施工质量保证措施,需针对冬期（混凝土保温）、雨期（防水加强）、高温（混凝土保湿）制定专项措施、确保施工质量；,季节性施工;冬期施工;雨期施工;高温施工;夏季施工;冬季混凝土
			
 
				 environment,环境保证措施,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,BlockDiagramOfEnvironmentalAssuranceSystem,环境保证体系框图,环境保证体系的视觉化呈现、需明确体系的核心要素（如组织机构、制度流程、资源保障）及逻辑关系、是公司标准体系的具象化载体；,环境保证体系;环境管理体系框图;环境保证体系框图
			
 
				 environment,环境保证措施,EnvSystem,环境保证体系,环境保证体系框图、公司标准体系引用,CompanyStandardSystemReference,公司标准体系引用,应引用公司标准体系框图、强调环境保证体系需承接公司现有标准（如《公司环境管理体系手册》《公司环境保护管理办法》）、确保体系的一致性与延续性；,环境管理体系;环境保护管理办法;公司环境标准;环境体系引用
			
 
				 environment,环境保证措施,EnvOrg,环境保护组织机构,环境保护组织架构、管理人员姓名、管理人员职务、管理人员职责、环境管理岗位责任、责任考核机制、环境管理职责分工、环境管理人员资质、环境管理沟通机制,EnvironmentalAssuranceSystemFramework,环境保护组织架构,包含管理人员姓名、职务、职责、环境管理的责任主体、基于项目经理为组长的工作领导小组、小组中包括项目经理、项目副经理、项目总工、工程部门、质检部门、安全环保部门、专业分包单位（协作队伍）项目负责人和项目技术负责人等、需明确机构的层级（如公司级、项目级、班组级）及组成部门（如环境部、工程部、技术部）、形成“横向到边、纵向到底”的管理网络；,环境保护组织;环境管理机构;环境管理组织架构;环境领导小组
			
--- a/core/construction_review/component/doc_worker/docx_worker/__init__.py
+++ b/core/construction_review/component/doc_worker/docx_worker/__init__.py
@@ -1,17 +0,0 @@
 
				-"""
			
 
				-DOCX 文档处理模块
			
 
				-
			
 
				-提供 DOCX 文件的目录提取、全文提取、文本切分等功能。
			
 
				-"""
			
 
				-
			
 
				-from .pipeline import DocxPipeline
			
 
				-from .toc_extractor import DocxTOCExtractor
			
 
				-from .full_text_extractor import DocxFullTextExtractor
			
 
				-from .text_splitter import DocxTextSplitter
			
 
				-
			
 
				-__all__ = [
			
 
				-    "DocxPipeline",
			
 
				-    "DocxTOCExtractor",
			
 
				-    "DocxFullTextExtractor",
			
 
				-    "DocxTextSplitter",
			
 
				-]
			
--- a/core/construction_review/component/doc_worker/docx_worker/cli.py
+++ b/core/construction_review/component/doc_worker/docx_worker/cli.py
@@ -1,118 +0,0 @@
 
				-"""
			
 
				-DOCX 处理命令行接口
			
 
				-
			
 
				-用法示例：
			
 
				-  python -m file_parse.docx_worker.cli input.docx
			
 
				-  python -m file_parse.docx_worker.cli input.docx -l 1 --max-size 3000 --min-size 50
			
 
				-  python -m file_parse.docx_worker.cli input.docx -o ./output
			
 
				-"""
			
 
				-
			
 
				-import argparse
			
 
				-import json
			
 
				-import sys
			
 
				-from datetime import datetime
			
 
				-from pathlib import Path
			
 
				-
			
 
				-from ..interfaces import DocumentSource
			
 
				-from .pipeline import DocxPipeline
			
 
				-
			
 
				-
			
 
				-def main():
			
 
				-    parser = argparse.ArgumentParser(description="DOCX 文档处理工具")
			
 
				-    parser.add_argument("docx_path", help="输入 DOCX 文件路径")
			
 
				-    parser.add_argument(
			
 
				-        "-l", "--level",
			
 
				-        type=int,
			
 
				-        help="目标层级（默认从配置读取）"
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--max-size",
			
 
				-        type=int,
			
 
				-        help="最大块大小（默认从配置读取）"
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "--min-size",
			
 
				-        type=int,
			
 
				-        help="最小块大小（默认从配置读取）"
			
 
				-    )
			
 
				-    parser.add_argument(
			
 
				-        "-o", "--output",
			
 
				-        help="输出目录（默认为 ./output）"
			
 
				-    )
			
 
				-    
			
 
				-    args = parser.parse_args()
			
 
				-
			
 
				-    # 检查文件是否存在
			
 
				-    docx_path = Path(args.docx_path)
			
 
				-    if not docx_path.exists():
			
 
				-        print(f"错误：文件不存在 -> {docx_path}", file=sys.stderr)
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 创建输出目录
			
 
				-    output_dir = Path(args.output) if args.output else Path("./output")
			
 
				-    output_dir.mkdir(parents=True, exist_ok=True)
			
 
				-
			
 
				-    # 创建文档源
			
 
				-    source = DocumentSource(path=docx_path, file_type="docx")
			
 
				-
			
 
				-    # 运行处理流程
			
 
				-    try:
			
 
				-        pipeline = DocxPipeline()
			
 
				-        result = pipeline.run(
			
 
				-            source,
			
 
				-            target_level=args.level,
			
 
				-            max_chunk_size=args.max_size,
			
 
				-            min_chunk_size=args.min_size,
			
 
				-        )
			
 
				-    except Exception as e:
			
 
				-        print(f"处理失败：{e}", file=sys.stderr)
			
 
				-        import traceback
			
 
				-        traceback.print_exc()
			
 
				-        sys.exit(1)
			
 
				-
			
 
				-    # 生成输出文件名
			
 
				-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
			
 
				-    base_name = docx_path.stem
			
 
				-    output_file = output_dir / f"{base_name}_完整结果_{timestamp}.json"
			
 
				-
			
 
				-    # 构建完整输出结构
			
 
				-    output_data = {
			
 
				-        "source_file": str(docx_path.absolute()),
			
 
				-        "process_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
			
 
				-        "toc_summary": {
			
 
				-            "total_items": result["toc_info"]["toc_count"],
			
 
				-            "toc_pages": result["toc_info"]["toc_pages"],
			
 
				-        },
			
 
				-        "complete_toc_list": [
			
 
				-            {
			
 
				-                "index": i + 1,
			
 
				-                "title": item["title"],
			
 
				-                "page": item["page"],
			
 
				-                "level": item["level"],
			
 
				-                "original": item["original"],
			
 
				-            }
			
 
				-            for i, item in enumerate(result["toc_info"]["toc_items"])
			
 
				-        ],
			
 
				-        "classification_summary": {
			
 
				-            "target_level": result["meta"]["target_level"],
			
 
				-            "total_count": result["classification"]["total_count"],
			
 
				-            "categories": result["classification"].get("category_stats", {}),
			
 
				-        },
			
 
				-        "classified_items": result["classification"]["items"],
			
 
				-        "chunks": result["chunks"],
			
 
				-        "meta": result["meta"],
			
 
				-    }
			
 
				-
			
 
				-    # 写入文件
			
 
				-    with output_file.open("w", encoding="utf-8") as f:
			
 
				-        json.dump(output_data, f, ensure_ascii=False, indent=2)
			
 
				-
			
 
				-    print(f"\n处理完成！")
			
 
				-    print(f"  输出文件: {output_file}")
			
 
				-    print(f"  目录项数: {result['toc_info']['toc_count']}")
			
 
				-    print(f"  分类项数: {result['classification']['total_count']}")
			
 
				-    print(f"  文本块数: {len(result['chunks'])}")
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-    main()
			
--- a/core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/full_text_extractor.py
@@ -1,110 +0,0 @@
 
				-"""
			
 
				-DOCX 全文提取实现
			
 
				-
			
 
				-提取 DOCX 文档的全文内容，按段落组织，模拟分页。
			
 
				-"""
			
 
				-
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-import re
			
 
				-from io import BytesIO
			
 
				-from pathlib import Path
			
 
				-from typing import Any, Dict, List
			
 
				-
			
 
				-from docx import Document
			
 
				-
			
 
				-from ..interfaces import FullTextExtractor, DocumentSource
			
 
				-
			
 
				-
			
 
				-class DocxFullTextExtractor(FullTextExtractor):
			
 
				-    """DOCX 全文提取器"""
			
 
				-
			
 
				-    def __init__(self, paragraphs_per_page: int = 30):
			
 
				-        """
			
 
				-        初始化
			
 
				-        
			
 
				-        Args:
			
 
				-            paragraphs_per_page: 每页段落数（用于模拟分页）
			
 
				-        """
			
 
				-        self.paragraphs_per_page = paragraphs_per_page
			
 
				-
			
 
				-    def extract_full_text(self, source: DocumentSource) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        提取 DOCX 文档的全文内容
			
 
				-        
			
 
				-        返回结构：
			
 
				-        [
			
 
				-            {
			
 
				-                "page_num": int,
			
 
				-                "text": str,
			
 
				-                "start_pos": int,
			
 
				-                "end_pos": int,
			
 
				-                "source_file": str,
			
 
				-            },
			
 
				-            ...
			
 
				-        ]
			
 
				-        """
			
 
				-        # 加载文档
			
 
				-        if source.path:
			
 
				-            doc = Document(source.path)
			
 
				-            source_file = str(source.path)
			
 
				-        elif source.content:
			
 
				-            doc = Document(BytesIO(source.content))
			
 
				-            source_file = "bytes_stream"
			
 
				-        else:
			
 
				-            raise ValueError("DocumentSource 必须提供 path 或 content")
			
 
				-
			
 
				-        # 按照文档中的实际顺序提取段落和表格
			
 
				-        # 创建段落和表格的元素到对象的映射
			
 
				-        para_map = {para._element: para for para in doc.paragraphs}
			
 
				-        table_map = {table._element: table for table in doc.tables}
			
 
				-        
			
 
				-        # 按照文档中的顺序遍历所有元素
			
 
				-        all_elements = []
			
 
				-        for element in doc.element.body:
			
 
				-            if element in para_map:
			
 
				-                # 段落元素
			
 
				-                para = para_map[element]
			
 
				-                text = para.text
			
 
				-                # 过滤目录行：标题\t页码（页码部分支持带修饰符号）
			
 
				-                # 匹配从开头开始，包含制表符且末尾有数字的模式（目录行特征）
			
 
				-                if text and not re.match(r"^.+\t+.*?\d+.*?\s*$", text):
			
 
				-                    all_elements.append(text)
			
 
				-            elif element in table_map:
			
 
				-                # 表格元素
			
 
				-                table = table_map[element]
			
 
				-                table_text = self._extract_table_text(table)
			
 
				-                all_elements.append(table_text)
			
 
				-
			
 
				-        # 模拟分页：每 N 个元素作为一页
			
 
				-        pages_content = []
			
 
				-        current_pos = 0
			
 
				-        
			
 
				-        # 正则表达式：匹配 [表格开始]...任意内容...[表格结束] 模式
			
 
				-        table_placeholder_pattern = re.compile(
			
 
				-            r'\n?\[表格开始\]\n.*?\n\[表格结束\]\n?',
			
 
				-            re.DOTALL
			
 
				-        )
			
 
				-        
			
 
				-        for page_num in range(0, len(all_elements), self.paragraphs_per_page):
			
 
				-            page_elements = all_elements[page_num:page_num + self.paragraphs_per_page]
			
 
				-            page_text = "\n".join(page_elements)
			
 
				-            
			
 
				-            # 将任何可能存在的 [表格开始]...表格内容...[表格结束] 替换为占位符
			
 
				-            page_text = table_placeholder_pattern.sub('\n<表格></表格>\n', page_text)
			
 
				-            
			
 
				-            pages_content.append({
			
 
				-                "page_num": page_num // self.paragraphs_per_page + 1,
			
 
				-                "text": page_text,
			
 
				-                "start_pos": current_pos,
			
 
				-                "end_pos": current_pos + len(page_text),
			
 
				-                "source_file": source_file,
			
 
				-            })
			
 
				-            
			
 
				-            current_pos += len(page_text)
			
 
				-
			
 
				-        return pages_content
			
 
				-
			
 
				-    def _extract_table_text(self, table) -> str:
			
 
				-        """提取表格占位符，不提取实际内容"""
			
 
				-        return "\n<表格></表格>\n"
			
--- a/core/construction_review/component/doc_worker/docx_worker/pipeline.py
+++ b/core/construction_review/component/doc_worker/docx_worker/pipeline.py
@@ -1,106 +0,0 @@
 
				-"""
			
 
				-DOCX 文档处理流程
			
 
				-
			
 
				-整合目录提取、分类、全文提取、文本切分等步骤。
			
 
				-"""
			
 
				-
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-from pathlib import Path
			
 
				-from typing import Any, Dict, Optional
			
 
				-
			
 
				-from ..interfaces import DocumentPipeline, DocumentSource
			
 
				-from ..config.provider import default_config_provider
			
 
				-from ..classification.hierarchy_classifier import HierarchyClassifier
			
 
				-
			
 
				-from .toc_extractor import DocxTOCExtractor
			
 
				-from .full_text_extractor import DocxFullTextExtractor
			
 
				-from .text_splitter import DocxTextSplitter
			
 
				-
			
 
				-
			
 
				-class DocxPipeline(DocumentPipeline):
			
 
				-    """DOCX 文档处理流水线"""
			
 
				-
			
 
				-    def __init__(self):
			
 
				-        self._cfg = default_config_provider
			
 
				-        self._toc_extractor = DocxTOCExtractor()
			
 
				-        self._full_text_extractor = DocxFullTextExtractor(
			
 
				-            paragraphs_per_page=int(self._cfg.get("toc_extraction.paragraphs_per_page", 30))
			
 
				-        )
			
 
				-        self._text_splitter = DocxTextSplitter()
			
 
				-        self._classifier = HierarchyClassifier()
			
 
				-
			
 
				-    def run(
			
 
				-        self,
			
 
				-        source: DocumentSource,
			
 
				-        target_level: Optional[int] = None,
			
 
				-        max_chunk_size: Optional[int] = None,
			
 
				-        min_chunk_size: Optional[int] = None,
			
 
				-    ) -> Dict[str, Any]:
			
 
				-        """
			
 
				-        运行完整流程
			
 
				-        
			
 
				-        返回：
			
 
				-        {
			
 
				-            "toc_info": {...},
			
 
				-            "classification": {...},
			
 
				-            "chunks": [...],
			
 
				-            "meta": {...},
			
 
				-        }
			
 
				-        """
			
 
				-        # 从配置获取默认值
			
 
				-        if target_level is None:
			
 
				-            target_level = int(self._cfg.get("text_splitting.target_level", 1))
			
 
				-        if max_chunk_size is None:
			
 
				-            max_chunk_size = int(self._cfg.get("text_splitting.max_chunk_size", 3000))
			
 
				-        if min_chunk_size is None:
			
 
				-            min_chunk_size = int(self._cfg.get("text_splitting.min_chunk_size", 50))
			
 
				-
			
 
				-        print(f"开始处理 DOCX 文档...")
			
 
				-        print(f"  目标层级: {target_level}")
			
 
				-        print(f"  最大块大小: {max_chunk_size}")
			
 
				-        print(f"  最小块大小: {min_chunk_size}")
			
 
				-
			
 
				-        # 步骤1: 提取目录
			
 
				-        print("\n步骤1: 提取目录...")
			
 
				-        toc_info = self._toc_extractor.extract_toc(source)
			
 
				-        print(f"  提取到 {toc_info['toc_count']} 个目录项")
			
 
				-
			
 
				-        # 步骤2: 分类目录项
			
 
				-        print("\n步骤2: 分类目录项...")
			
 
				-        classification = self._classifier.classify(toc_info["toc_items"], target_level)
			
 
				-        print(f"  分类完成，共 {classification['total_count']} 个目标层级项")
			
 
				-
			
 
				-        # 步骤3: 提取全文
			
 
				-        print("\n步骤3: 提取全文...")
			
 
				-        pages_content = self._full_text_extractor.extract_full_text(source)
			
 
				-        print(f"  提取到 {len(pages_content)} 页内容")
			
 
				-
			
 
				-        # 步骤4: 切分文本
			
 
				-        print("\n步骤4: 切分文本...")
			
 
				-        chunks = self._text_splitter.split_by_hierarchy(
			
 
				-            classification["items"],
			
 
				-            pages_content,
			
 
				-            toc_info,
			
 
				-            target_level,
			
 
				-            max_chunk_size,
			
 
				-            min_chunk_size,
			
 
				-        )
			
 
				-        print(f"  切分完成，共 {len(chunks)} 个块")
			
 
				-
			
 
				-        # 填充文件名
			
 
				-        file_name = Path(source.path).name if source.path else "unknown.docx"
			
 
				-        for chunk in chunks:
			
 
				-            chunk["file_name"] = file_name
			
 
				-
			
 
				-        return {
			
 
				-            "toc_info": toc_info,
			
 
				-            "classification": classification,
			
 
				-            "chunks": chunks,
			
 
				-            "meta": {
			
 
				-                "target_level": target_level,
			
 
				-                "max_chunk_size": max_chunk_size,
			
 
				-                "min_chunk_size": min_chunk_size,
			
 
				-                "file_type": "docx",
			
 
				-            },
			
 
				-        }
			
--- a/core/construction_review/component/doc_worker/docx_worker/text_splitter.py
+++ b/core/construction_review/component/doc_worker/docx_worker/text_splitter.py
@@ -1,327 +0,0 @@
 
				-"""
			
 
				-DOCX 文本切分实现
			
 
				-
			
 
				-复刻 PDF 处理的切分逻辑：
			
 
				-1. 跳过目录页，只在正文中定位章节标题
			
 
				-2. 按最低目录层级进行切分，形成章节块
			
 
				-3. 对超过最大字符数的块按段落-句子进行再次切分，保持语义完整性
			
 
				-"""
			
 
				-
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-from typing import Any, Dict, List
			
 
				-
			
 
				-from ..config.provider import default_config_provider
			
 
				-from ..interfaces import TextSplitter
			
 
				-from ..utils.title_matcher import TitleMatcher
			
 
				-from ..utils.text_split_support import HierarchicalChunkMixin
			
 
				-
			
 
				-
			
 
				-class DocxTextSplitter(TextSplitter, HierarchicalChunkMixin):
			
 
				-    """按目录层级对 DOCX 正文进行智能分块的实现"""
			
 
				-
			
 
				-    def __init__(self) -> None:
			
 
				-        self._cfg = default_config_provider
			
 
				-        self._title_matcher = TitleMatcher()
			
 
				-
			
 
				-    def split_by_hierarchy(
			
 
				-        self,
			
 
				-        classification_items: List[Dict[str, Any]],
			
 
				-        pages_content: List[Dict[str, Any]],
			
 
				-        toc_info: Dict[str, Any],
			
 
				-        target_level: int,
			
 
				-        max_chunk_size: int,
			
 
				-        min_chunk_size: int,
			
 
				-    ) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        按目录层级和字符数智能切分文本
			
 
				-        
			
 
				-        逻辑与 PDF 处理完全一致
			
 
				-        """
			
 
				-        toc_pages = toc_info.get("toc_pages", []) or []
			
 
				-        all_toc_items = toc_info.get("toc_items", [])
			
 
				-        
			
 
				-        # 使用完整全文
			
 
				-        full_text = "".join(p.get("text", "") for p in pages_content)
			
 
				-
			
 
				-        print(f"  正在定位{len(classification_items)}个已分类的标题...")
			
 
				-        print(f"  目录所在页: {toc_pages}")
			
 
				-
			
 
				-        # 步骤1: 在正文中定位已分类的标题（跳过目录页）
			
 
				-        located = self._title_matcher.find_title_positions(
			
 
				-            classification_items, full_text, pages_content, toc_pages
			
 
				-        )
			
 
				-        
			
 
				-        # 只保留成功定位的标题
			
 
				-        found_titles = [t for t in located if t["found"]]
			
 
				-        if not found_titles:
			
 
				-            print(f"  错误: 未能在正文中定位任何标题")
			
 
				-            return []
			
 
				-
			
 
				-        print(f"  成功定位 {len(found_titles)}/{len(classification_items)} 个标题")
			
 
				-        
			
 
				-        # 按位置排序
			
 
				-        found_titles.sort(key=lambda x: x["position"])
			
 
				-
			
 
				-        # 步骤2: 构建一级目录标题到分类信息的映射
			
 
				-        chapter_classification_map: Dict[str, Dict[str, Any]] = {}
			
 
				-        for item in classification_items:
			
 
				-            if item.get("level") == 1:
			
 
				-                chapter_title = item.get("title", "")
			
 
				-                chapter_classification_map[chapter_title] = {
			
 
				-                    "category": item.get("category", ""),
			
 
				-                    "category_code": item.get("category_code", "other"),
			
 
				-                    "page": item.get("page", ""),
			
 
				-                    "level": item.get("level", 1),
			
 
				-                }
			
 
				-
			
 
				-        # 步骤3: 为每个找到的标题构建完整的层级路径
			
 
				-        for title_info in found_titles:
			
 
				-            hierarchy_path = self._build_hierarchy_path(
			
 
				-                title_info["title"], all_toc_items, target_level
			
 
				-            )
			
 
				-            title_info["hierarchy_path"] = hierarchy_path
			
 
				-
			
 
				-        # 步骤4: 按目录层级处理每个标题块
			
 
				-        all_chunks: List[Dict[str, Any]] = []
			
 
				-        
			
 
				-        for i, title_info in enumerate(found_titles):
			
 
				-            start_pos = title_info["position"]
			
 
				-            
			
 
				-            # 确定正文块的结束位置（下一个同级标题的位置）
			
 
				-            if i + 1 < len(found_titles):
			
 
				-                end_pos = found_titles[i + 1]["position"]
			
 
				-            else:
			
 
				-                end_pos = len(full_text)
			
 
				-            
			
 
				-            # 提取正文块
			
 
				-            content_block = full_text[start_pos:end_pos]
			
 
				-            
			
 
				-            # 在正文块中查找子标题（按最低层级切分）
			
 
				-            sub_chunks = self._split_by_sub_titles(
			
 
				-                content_block,
			
 
				-                all_toc_items,
			
 
				-                title_info,
			
 
				-                target_level,
			
 
				-                max_chunk_size,
			
 
				-                min_chunk_size,
			
 
				-            )
			
 
				-            
			
 
				-            # 为每个子块添加元数据
			
 
				-            for j, sub_chunk in enumerate(sub_chunks, 1):
			
 
				-                chunk_data = self._build_chunk_metadata(
			
 
				-                    sub_chunk, title_info, start_pos, pages_content, i, j, chapter_classification_map
			
 
				-                )
			
 
				-                all_chunks.append(chunk_data)
			
 
				-
			
 
				-        # 步骤4: 生成最终的chunk_id和serial_number
			
 
				-        final_chunks = self._finalize_chunk_ids(all_chunks)
			
 
				-
			
 
				-        print(f"  初始切分: {len(all_chunks)} 个块")
			
 
				-        print(f"  最终块数: {len(final_chunks)} 个块")
			
 
				-
			
 
				-        return final_chunks
			
 
				-
			
 
				-    def _split_by_sub_titles(
			
 
				-        self,
			
 
				-        content_block: str,
			
 
				-        all_toc_items: List[Dict[str, Any]],
			
 
				-        parent_title_info: Dict[str, Any],
			
 
				-        target_level: int,
			
 
				-        max_chunk_size: int,
			
 
				-        min_chunk_size: int,
			
 
				-    ) -> List[Dict[str, Any]]:
			
 
				-        """
			
 
				-        在正文块中按子标题进行切分（按照toc_items的顺序和层级关系）
			
 
				-        
			
 
				-        核心逻辑：
			
 
				-        1. 查找所有层级的子标题（不限于直接子标题）
			
 
				-        2. 按位置排序后，两个相邻子标题之间的内容作为一个块
			
 
				-        3. 只有当块超过 max_chunk_size 时才按句子切分
			
 
				-        """
			
 
				-        # 找到父标题在toc_items中的位置
			
 
				-        parent_title = parent_title_info["title"]
			
 
				-        parent_idx = -1
			
 
				-        parent_level = target_level
			
 
				-        
			
 
				-        for idx, toc_item in enumerate(all_toc_items):
			
 
				-            if toc_item["title"] == parent_title:
			
 
				-                parent_idx = idx
			
 
				-                parent_level = toc_item.get("level", target_level)
			
 
				-                break
			
 
				-
			
 
				-        if parent_idx < 0:
			
 
				-            # 如果找不到父标题，将整个正文块作为一个块
			
 
				-            if len(content_block) > max_chunk_size:
			
 
				-                return self._split_large_chunk(content_block, max_chunk_size, parent_title, [])
			
 
				-            else:
			
 
				-                return [
			
 
				-                    {
			
 
				-                        "content": content_block,
			
 
				-                        "relative_start": 0,
			
 
				-                        "sub_title": "",
			
 
				-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
			
 
				-                    }
			
 
				-                ]
			
 
				-
			
 
				-        # 找到下一个同级或更高级标题的位置（确定父标题的范围）
			
 
				-        next_sibling_idx = len(all_toc_items)
			
 
				-        for idx in range(parent_idx + 1, len(all_toc_items)):
			
 
				-            item = all_toc_items[idx]
			
 
				-            if item.get("level", 1) <= parent_level:
			
 
				-                next_sibling_idx = idx
			
 
				-                break
			
 
				-
			
 
				-        # 查找所有子标题（所有 level > parent_level 的标题）
			
 
				-        # 这是关键：不限于直接子标题，而是所有更深层级的标题
			
 
				-        all_sub_titles = []
			
 
				-        fuzzy_threshold = float(self._cfg.get("text_splitting.fuzzy_threshold", 0.8))
			
 
				-
			
 
				-        for idx in range(parent_idx + 1, next_sibling_idx):
			
 
				-            toc_item = all_toc_items[idx]
			
 
				-            item_level = toc_item.get("level", 1)
			
 
				-            
			
 
				-            # 查找所有更深层级的子标题
			
 
				-            if item_level > parent_level:
			
 
				-                # 在正文块中查找这个子标题
			
 
				-                pos = self._find_title_in_block(
			
 
				-                    toc_item["title"], content_block, fuzzy_threshold
			
 
				-                )
			
 
				-                if pos >= 0:
			
 
				-                    # 调试：显示找到的标题及其周围内容
			
 
				-                    context_start = max(0, pos - 20)
			
 
				-                    context_end = min(len(content_block), pos + len(toc_item["title"]) + 50)
			
 
				-                    context = content_block[context_start:context_end].replace("\n", " ")
			
 
				-                    print(f"        找到子标题: {toc_item['title']} (level={item_level}), 位置={pos}, 上下文: ...{context}...")
			
 
				-                    
			
 
				-                    all_sub_titles.append(
			
 
				-                        {
			
 
				-                            "title": toc_item["title"],
			
 
				-                            "level": toc_item["level"],
			
 
				-                            "position": pos,
			
 
				-                            "toc_index": idx,
			
 
				-                            "toc_item": toc_item,
			
 
				-                        }
			
 
				-                    )
			
 
				-
			
 
				-        # 按位置排序
			
 
				-        all_sub_titles.sort(key=lambda x: x["position"])
			
 
				-
			
 
				-        # 如果没有找到任何子标题，将整个正文块作为一个块
			
 
				-        if not all_sub_titles:
			
 
				-            if len(content_block) > max_chunk_size:
			
 
				-                return self._split_large_chunk(
			
 
				-                    content_block, max_chunk_size, parent_title, 
			
 
				-                    parent_title_info.get("hierarchy_path", [parent_title])
			
 
				-                )
			
 
				-            else:
			
 
				-                return [
			
 
				-                    {
			
 
				-                        "content": content_block,
			
 
				-                        "relative_start": 0,
			
 
				-                        "sub_title": "",
			
 
				-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
			
 
				-                    }
			
 
				-                ]
			
 
				-
			
 
				-        # 找到直接子标题（parent_level + 1）和所有更深层级的标题
			
 
				-        direct_child_level = parent_level + 1
			
 
				-        direct_child_titles = [sub for sub in all_sub_titles if sub["level"] == direct_child_level]
			
 
				-        
			
 
				-        # 找到最低层级（用于判断哪些是最底层的标题）
			
 
				-        max_level = max(sub["level"] for sub in all_sub_titles) if all_sub_titles else parent_level
			
 
				-        
			
 
				-        print(f"      父标题: {parent_title}, 找到 {len(all_sub_titles)} 个子标题, 直接子标题数: {len(direct_child_titles)}, 最低层级: {max_level}")
			
 
				-
			
 
				-        # 如果没有直接子标题，但有更深层级的标题，使用最低层级标题切分（保持向后兼容）
			
 
				-        if not direct_child_titles and all_sub_titles:
			
 
				-            lowest_level_titles = [sub for sub in all_sub_titles if sub["level"] == max_level]
			
 
				-            print(f"      没有直接子标题，使用最低层级标题切分: {len(lowest_level_titles)} 个")
			
 
				-            direct_child_titles = lowest_level_titles
			
 
				-
			
 
				-        # 按直接子标题切分（如果存在）
			
 
				-        chunks = []
			
 
				-        if direct_child_titles:
			
 
				-            for i, sub_title in enumerate(direct_child_titles):
			
 
				-                start_pos = sub_title["position"]
			
 
				-
			
 
				-                # 确定结束位置（下一个同级或更高级标题的位置）
			
 
				-                # 在 all_sub_titles 中查找下一个位置大于当前标题，且 level <= direct_child_level 的标题
			
 
				-                end_pos = len(content_block)
			
 
				-                for next_sub in all_sub_titles:
			
 
				-                    if next_sub["position"] > start_pos and next_sub["level"] <= direct_child_level:
			
 
				-                        end_pos = next_sub["position"]
			
 
				-                        break
			
 
				-
			
 
				-                chunk_content = content_block[start_pos:end_pos]
			
 
				-                
			
 
				-                # 调试信息
			
 
				-                content_preview = chunk_content[:100].replace("\n", " ")
			
 
				-                print(f"        切分块 {i+1}: {sub_title['title']} (level={sub_title['level']}), 位置: {start_pos}-{end_pos}, 长度: {len(chunk_content)}, 预览: {content_preview}...")
			
 
				-
			
 
				-                # 检查子标题是否有实际正文内容
			
 
				-                title_len = len(sub_title["title"])
			
 
				-                content_after_title = chunk_content[title_len:].strip()
			
 
				-
			
 
				-                if not content_after_title or len(content_after_title) < 10:
			
 
				-                    print(f"        跳过（内容不足）")
			
 
				-                    continue
			
 
				-
			
 
				-                # 构建层级路径
			
 
				-                hierarchy_path = self._build_hierarchy_path_for_subtitle(
			
 
				-                    sub_title["toc_item"], all_toc_items, parent_title_info
			
 
				-                )
			
 
				-
			
 
				-                # 只有当块超过 max_chunk_size 时才按句子切分
			
 
				-                if len(chunk_content) > max_chunk_size:
			
 
				-                    print(f"        块过大，按句子切分")
			
 
				-                    split_chunks = self._split_large_chunk(
			
 
				-                        chunk_content, max_chunk_size, sub_title["title"], hierarchy_path
			
 
				-                    )
			
 
				-                    for split_chunk in split_chunks:
			
 
				-                        split_chunk["relative_start"] = start_pos + split_chunk["relative_start"]
			
 
				-                        split_chunk["sub_title"] = sub_title["title"]
			
 
				-                        if "hierarchy_path" not in split_chunk:
			
 
				-                            split_chunk["hierarchy_path"] = hierarchy_path
			
 
				-                        chunks.append(split_chunk)
			
 
				-                else:
			
 
				-                    # 直接作为一个块
			
 
				-                    chunks.append(
			
 
				-                        {
			
 
				-                            "content": chunk_content,
			
 
				-                            "relative_start": start_pos,
			
 
				-                            "sub_title": sub_title["title"],
			
 
				-                            "hierarchy_path": hierarchy_path,
			
 
				-                        }
			
 
				-                    )
			
 
				-
			
 
				-        # 如果所有子标题都没有正文内容，返回整个正文块
			
 
				-        if not chunks:
			
 
				-            if len(content_block) > max_chunk_size:
			
 
				-                return self._split_large_chunk(
			
 
				-                    content_block, max_chunk_size, parent_title,
			
 
				-                    parent_title_info.get("hierarchy_path", [parent_title])
			
 
				-                )
			
 
				-            else:
			
 
				-                return [
			
 
				-                    {
			
 
				-                        "content": content_block,
			
 
				-                        "relative_start": 0,
			
 
				-                        "sub_title": "",
			
 
				-                        "hierarchy_path": parent_title_info.get("hierarchy_path", [parent_title]),
			
 
				-                    }
			
 
				-                ]
			
 
				-
			
 
				-        return chunks
			
 
				-
			
 
				-    def _find_title_in_block(self, title: str, block: str, fuzzy_threshold: float) -> int:
			
 
				-        """在文本块中查找标题位置（简化版）"""
			
 
				-        # 直接使用 TitleMatcher 的方法
			
 
				-        return self._title_matcher._find_title_in_text(title, block, fuzzy_threshold)
			
 
				-
			
 
				-    def _get_page_from_pos(self, pos: int, pages_content: List[Dict[str, Any]]) -> int:
			
 
				-        """根据位置获取页码"""
			
 
				-        for page in pages_content:
			
 
				-            if page["start_pos"] <= pos < page["end_pos"]:
			
 
				-                return int(page["page_num"])
			
 
				-        return 1
			
--- a/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
+++ b/core/construction_review/component/doc_worker/docx_worker/toc_extractor.py
@@ -1,123 +0,0 @@
 
				-"""
			
 
				-DOCX 目录提取实现
			
 
				-
			
 
				-参考 docx_toc_detector.py 的逻辑，识别目录行（标题 + 制表符 + 页码）。
			
 
				-"""
			
 
				-
			
 
				-from __future__ import annotations
			
 
				-
			
 
				-import re
			
 
				-from pathlib import Path
			
 
				-from typing import Any, Dict, List
			
 
				-
			
 
				-from docx import Document
			
 
				-
			
 
				-from ..interfaces import TOCExtractor, DocumentSource
			
 
				-from ..utils.toc_level_identifier import TOCLevelIdentifier
			
 
				-from ..utils.toc_pattern_matcher import TOCPatternMatcher
			
 
				-
			
 
				-
			
 
				-class DocxTOCExtractor(TOCExtractor):
			
 
				-    """DOCX 目录提取器"""
			
 
				-
			
 
				-    # 目录行模式：标题 + 制表符 + 页码（页码部分支持带修饰符号，如 ‐ 19 ‐）
			
 
				-    TOC_PATTERN = re.compile(r"^(?P<title>.+?)\t+(?P<page>.*?\d+.*?)\s*$")
			
 
				-
			
 
				-    def __init__(self) -> None:
			
 
				-        """初始化 DOCX 目录提取器"""
			
 
				-        self._level_identifier = TOCLevelIdentifier()
			
 
				-        self._page_extractor = TOCPatternMatcher()
			
 
				-
			
 
				-    def extract_toc(self, source: DocumentSource) -> Dict[str, Any]:
			
 
				-        """
			
 
				-        提取 DOCX 文档的目录信息
			
 
				-        
			
 
				-        返回结构：
			
 
				-        {
			
 
				-            "toc_items": [{"title": str, "page": int, "level": int, "original": str}, ...],
			
 
				-            "toc_count": int,
			
 
				-            "toc_pages": List[int],
			
 
				-        }
			
 
				-        """
			
 
				-        # 加载文档
			
 
				-        if source.path:
			
 
				-            doc = Document(source.path)
			
 
				-        elif source.content:
			
 
				-            from io import BytesIO
			
 
				-            doc = Document(BytesIO(source.content))
			
 
				-        else:
			
 
				-            raise ValueError("DocumentSource 必须提供 path 或 content")
			
 
				-
			
 
				-        # 提取目录行
			
 
				-        toc_items = []
			
 
				-        toc_pages_set = set()
			
 
				-        
			
 
				-        for para in doc.paragraphs:
			
 
				-            text = para.text.strip()
			
 
				-            if "\t" not in text:
			
 
				-                continue
			
 
				-            
			
 
				-            match = self.TOC_PATTERN.match(text)
			
 
				-            if match:
			
 
				-                title = match.group("title").strip()
			
 
				-                page_raw = match.group("page").strip()
			
 
				-                
			
 
				-                # 从可能带有修饰符号的页码中提取纯数字
			
 
				-                page_num_str = self._page_extractor.extract_page_number(page_raw)
			
 
				-                try:
			
 
				-                    page = int(page_num_str)
			
 
				-                except ValueError:
			
 
				-                    # 如果无法转换为整数，跳过该项
			
 
				-                    continue
			
 
				-                
			
 
				-                # 先不设置层级，后续统一识别
			
 
				-                toc_items.append({
			
 
				-                    "title": title,
			
 
				-                    "page": page,
			
 
				-                    "original": text,
			
 
				-                })
			
 
				-                
			
 
				-                toc_pages_set.add(page)
			
 
				-
			
 
				-        # 估算目录所在页（假设目录在前几页）
			
 
				-        if toc_items:
			
 
				-            # 目录页通常是目录项中最小页码之前的页
			
 
				-            min_content_page = min(item["page"] for item in toc_items)
			
 
				-            toc_pages = list(range(1, min(min_content_page, 10)))
			
 
				-        else:
			
 
				-            toc_pages = []
			
 
				-
			
 
				-        # 使用 TOCLevelIdentifier 识别层级（与 doc_worker 保持一致）
			
 
				-        toc_items = self._level_identifier.identify_levels(toc_items)
			
 
				-
			
 
				-        return {
			
 
				-            "toc_items": toc_items,
			
 
				-            "toc_count": len(toc_items),
			
 
				-            "toc_pages": toc_pages,
			
 
				-        }
			
 
				-
			
 
				-    def _detect_level(self, title: str) -> int:
			
 
				-        """
			
 
				-        根据标题格式检测层级（已废弃，保留仅用于向后兼容）
			
 
				-        
			
 
				-        注意：此方法已不再使用，现在使用 TOCLevelIdentifier 统一识别层级。
			
 
				-        保留此方法仅用于向后兼容和测试。
			
 
				-        """
			
 
				-        # 章节格式
			
 
				-        if re.match(r"^第[一二三四五六七八九十\d]+章", title):
			
 
				-            return 1
			
 
				-        
			
 
				-        # 中文编号 + 右括号
			
 
				-        if re.match(r"^[一二三四五六七八九十]+[）)]", title):
			
 
				-            return 2
			
 
				-        
			
 
				-        # 数字 + 顿号/句号
			
 
				-        if re.match(r"^\d+[、．.]", title):
			
 
				-            return 3
			
 
				-        
			
 
				-        # 括号数字
			
 
				-        if re.match(r"^[\(（]\d+[\)）]", title):
			
 
				-            return 4
			
 
				-        
			
 
				-        # 默认 level 2
			
 
				-        return 2
			
--- a/core/construction_review/component/doc_worker/docx_worker/命令
+++ b/core/construction_review/component/doc_worker/docx_worker/命令
@@ -1 +0,0 @@
 
				-python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
			
--- a/core/construction_review/component/doc_worker/utils/text_split_support.py
+++ b/core/construction_review/component/doc_worker/utils/text_split_support.py
@@ -114,7 +114,7 @@ class HierarchicalChunkMixin:
 
				     """
			
 
				     分级目录切分的通用工具 Mixin。
			
 
				 
			
 
				-    把原先 `PdfTextSplitter` / `DocxTextSplitter` 中完全相同的
			
 
				+    把原先 `PdfTextSplitter` 中完全相同的
			
 
				     chunk 元数据构造、层级路径、编号提取等方法抽到这里，
			
 
				     便于多种 worker 复用。
			
 
				     """
			
--- a/core/construction_review/component/doc_worker/命令
+++ b/core/construction_review/component/doc_worker/命令
@@ -1,10 +0,0 @@
 
				-python -m file_parse.docx_worker.cli ".\路桥\47_四川川交路桥有限责任公司会理至禄劝（四川境）高速公路项目土建项目ZCB1-3合同段项目经理部.docx" -l 1 --max-size 3000 --min-size 50 -o ./output
			
 
				-python -m core.construction_review.component.doc_worker.pdf_worker.cli "E:\LLM\dev_v1\files\7a88f0d5-9d82-43bf-b2b1-c2924d67477e.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
			
 
				-
			
 
				-
			
 
				-
			
 
				-python -m file_parse.pdf_worker.cli "Z:\施工方案及编制依据案例库（第一阶段）1205\施工方案文档列表\44_四川公路桥梁建设集团有限公司镇巴（川陕界）至广安高速公路通广段C合同段C4项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
			
 
				-
			
 
				-
			
 
				-
			
 
				-python -m doc_worker.pdf_worker.cli "data\44_四川公路桥梁建设集团有限公司镇巴（川陕界）至广安高速公路通广段C合同段C4项目经理部.pdf" -l 1 --max-size 3000 --min-size 50 -o ./output
			
--- a/core/construction_review/component/document_processor.py
+++ b/core/construction_review/component/document_processor.py
@@ -5,9 +5,11 @@
 
				 
			
 
				 重构说明:
			
 
				 1. 使用类级别共享ChunkClassifier实例，避免重复创建LLM客户端
			
 
				-2. 统一PDF/DOCX处理流程，消除代码重复
			
 
				+2. 统一PDF处理流程，消除代码重复
			
 
				 3. 移除splits冗余数据，统一使用chunks
			
 
				 4. 完善异常处理，记录完整堆栈信息
			
 
				+
			
 
				+注意: DOCX/DOC 文件应在上传层转换为 PDF，本模块不再直接处理 DOCX
			
 
				 """
			
 
				 
			
 
				 import io
			
@@ -31,9 +33,6 @@ try:
 
				     from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
			
 
				     from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
			
 
				     from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
			
 
				-    from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
			
 
				-    from .doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
			
 
				-    from .doc_worker.docx_worker.text_splitter import DocxTextSplitter
			
 
				     from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
			
 
				     from .doc_worker.classification.chunk_classifier import ChunkClassifier
			
 
				     from .doc_worker.config.provider import default_config_provider
			
@@ -43,9 +42,6 @@ except ImportError:
 
				     from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
			
 
				     from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
			
 
				     from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
			
 
				-    from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
			
 
				-    from core.construction_review.component.doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
			
 
				-    from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
			
 
				     from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
			
 
				     from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
			
 
				     from core.construction_review.component.doc_worker.config.provider import default_config_provider
			
@@ -158,7 +154,7 @@ class DocumentProcessor:
 
				     _shared_chunk_classifier: Optional[ChunkClassifier] = None
			
 
				 
			
 
				     def __init__(self, progress_manager=None, callback_task_id: str = None, progress_state: dict = None):
			
 
				-        self.supported_types = ['pdf', 'docx']
			
 
				+        self.supported_types = ['pdf']  # DOCX/DOC 应在上传层转换为 PDF
			
 
				         self.config = default_config_provider
			
 
				         # SSE 进度推送（由 DocumentWorkflow 注入）
			
 
				         self._progress_manager = progress_manager
			
@@ -166,24 +162,54 @@ class DocumentProcessor:
 
				         # 与心跳协程共享的状态字典，更新后心跳自动反映新阶段
			
 
				         self._progress_state = progress_state
			
 
				 
			
 
				-        # 初始化各类型文档的处理组件
			
 
				+        # 初始化PDF文档的处理组件
			
 
				         self._components: Dict[str, DocumentComponents] = {
			
 
				             'pdf': DocumentComponents(
			
 
				                 toc_extractor=PdfTOCExtractor(),
			
 
				                 classifier=PdfHierarchyClassifier(),
			
 
				                 fulltext_extractor=HybridFullTextExtractor(),
			
 
				                 text_splitter=PdfTextSplitter()
			
 
				-            ),
			
 
				-            'docx': DocumentComponents(
			
 
				-                toc_extractor=DocxTOCExtractor(),
			
 
				-                classifier=DocxHierarchyClassifier(),
			
 
				-                fulltext_extractor=DocxFullTextExtractor(
			
 
				-                    paragraphs_per_page=int(self.config.get("toc_extraction.paragraphs_per_page", 30))
			
 
				-                ),
			
 
				-                text_splitter=DocxTextSplitter()
			
 
				             )
			
 
				         }
			
 
				 
			
 
				+        # 加载标准分类表并创建序号映射
			
 
				+        self._load_category_seq_mappings()
			
 
				+
			
 
				+    def _load_category_seq_mappings(self):
			
 
				+        """加载标准分类表CSV，创建code到seq的映射"""
			
 
				+        self._first_seq_map: Dict[str, int] = {}  # first_code -> first_seq
			
 
				+        self._second_seq_map: Dict[str, int] = {}  # second_code -> second_seq
			
 
				+
			
 
				+        try:
			
 
				+            import csv
			
 
				+            csv_path = Path(__file__).parent / 'doc_worker' / 'config' / 'StandardCategoryTable.csv'
			
 
				+            if not csv_path.exists():
			
 
				+                logger.warning(f"标准分类表不存在: {csv_path}")
			
 
				+                return
			
 
				+
			
 
				+            with open(csv_path, 'r', encoding='utf-8-sig') as f:
			
 
				+                reader = csv.DictReader(f)
			
 
				+                for row in reader:
			
 
				+                    first_code = row.get('first_code', '').strip()
			
 
				+                    second_code = row.get('second_code', '').strip()
			
 
				+                    try:
			
 
				+                        first_seq = int(row.get('first_seq', 0) or 0)
			
 
				+                    except (ValueError, TypeError):
			
 
				+                        first_seq = 0
			
 
				+                    try:
			
 
				+                        second_seq = int(row.get('second_seq', 0) or 0)
			
 
				+                    except (ValueError, TypeError):
			
 
				+                        second_seq = 0
			
 
				+
			
 
				+                    if first_code and first_code not in self._first_seq_map:
			
 
				+                        self._first_seq_map[first_code] = first_seq
			
 
				+                    if second_code and second_code not in self._second_seq_map:
			
 
				+                        self._second_seq_map[second_code] = second_seq
			
 
				+
			
 
				+            logger.debug(f"加载分类序号映射: 一级 {len(self._first_seq_map)} 个, 二级 {len(self._second_seq_map)} 个")
			
 
				+        except Exception as e:
			
 
				+            logger.warning(f"加载分类序号映射失败: {e}")
			
 
				+
			
 
				     @classmethod
			
 
				     def _get_chunk_classifier(cls) -> ChunkClassifier:
			
 
				         """获取共享的ChunkClassifier实例"""
			
@@ -456,10 +482,6 @@ class DocumentProcessor:
 
				             }
			
 
				         }
			
 
				 
			
 
				-        # DOCX额外保留full_text字段
			
 
				-        if file_type == 'docx':
			
 
				-            result['full_text'] = ''.join([page.get('text', '') for page in pages_content])
			
 
				-
			
 
				         return result
			
 
				 
			
 
				     async def _fallback_processing(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
			
@@ -468,15 +490,12 @@ class DocumentProcessor:
 
				 
			
 
				         Args:
			
 
				             file_content: 文件内容
			
 
				-            file_type: 文件类型（pdf/docx）
			
 
				+            file_type: 文件类型（仅支持 pdf）
			
 
				 
			
 
				         Returns:
			
 
				             Dict: 基础处理结果
			
 
				         """
			
 
				-        if file_type == 'pdf':
			
 
				-            return await self._fallback_pdf_processing(file_content)
			
 
				-        else:
			
 
				-            return await self._fallback_docx_processing(file_content)
			
 
				+        return await self._fallback_pdf_processing(file_content)
			
 
				 
			
 
				     async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
			
 
				         """PDF基础处理模式（当智能处理失败时使用）"""
			
@@ -533,46 +552,6 @@ class DocumentProcessor:
 
				             logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
			
 
				             raise
			
 
				 
			
 
				-    async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
			
 
				-        """DOCX基础处理模式（当智能处理失败时使用）"""
			
 
				-        try:
			
 
				-            from docx import Document
			
 
				-            from io import BytesIO
			
 
				-
			
 
				-            logger.info("使用基础DOCX处理模式（内存模式）")
			
 
				-            doc = Document(BytesIO(file_content))
			
 
				-            full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
			
 
				-
			
 
				-            # 简单分块，并过滤空内容
			
 
				-            chunks = []
			
 
				-            chunk_size = 1000
			
 
				-            chunk_index = 1
			
 
				-            for i in range(0, len(full_text), chunk_size):
			
 
				-                chunk_text = full_text[i:i+chunk_size].strip()
			
 
				-                if chunk_text:
			
 
				-                    chunks.append({
			
 
				-                        'chunk_id': f'chunk_{chunk_index}',
			
 
				-                        'content': chunk_text,
			
 
				-                        'metadata': {'chunk_index': chunk_index}
			
 
				-                    })
			
 
				-                    chunk_index += 1
			
 
				-
			
 
				-            logger.info(f"基础处理完成，有效分块数量: {len(chunks)}")
			
 
				-
			
 
				-            return {
			
 
				-                'document_type': 'docx',
			
 
				-                'total_chunks': len(chunks),
			
 
				-                'full_text': full_text,
			
 
				-                'chunks': chunks,
			
 
				-                'metadata': {
			
 
				-                    'paragraphs_count': len(doc.paragraphs),
			
 
				-                    'word_count': len(full_text.split())
			
 
				-                }
			
 
				-            }
			
 
				-        except Exception as e:
			
 
				-            logger.error(f"基础DOCX处理失败: {str(e)}", exc_info=True)
			
 
				-            raise
			
 
				-
			
 
				     def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
			
 
				         """结构化处理，适配doc_worker返回的格式"""
			
 
				         try:
			
@@ -589,6 +568,12 @@ class DocumentProcessor:
 
				                     if content:
			
 
				                         metadata = chunk.get('metadata', {})
			
 
				                         element_tag = metadata.get('element_tag', {})
			
 
				+                        chapter_classification = metadata.get('chapter_classification', '')
			
 
				+                        secondary_category_code = metadata.get('secondary_category_code', '')
			
 
				+
			
 
				+                        # 获取序号
			
 
				+                        first_seq = self._first_seq_map.get(chapter_classification, 0)
			
 
				+                        second_seq = self._second_seq_map.get(secondary_category_code, 0)
			
 
				 
			
 
				                         chunks.append({
			
 
				                             'chunk_id': metadata.get('chunk_id', ''),
			
@@ -596,9 +581,11 @@ class DocumentProcessor:
 
				                             'content': content,
			
 
				                             'section_label': metadata.get('section_label', ''),
			
 
				                             'project_plan_type': metadata.get('project_plan_type', ''),
			
 
				-                            'chapter_classification': metadata.get('chapter_classification', ''),
			
 
				+                            'chapter_classification': chapter_classification,
			
 
				+                            'first_seq': first_seq,
			
 
				                             'secondary_category_cn': metadata.get('secondary_category_cn', ''),
			
 
				-                            'secondary_category_code': metadata.get('secondary_category_code', ''),
			
 
				+                            'secondary_category_code': secondary_category_code,
			
 
				+                            'second_seq': second_seq,
			
 
				                             'tertiary_category_cn': metadata.get('tertiary_category_cn', ''),
			
 
				                             'tertiary_category_code': metadata.get('tertiary_category_code', ''),
			
 
				                             # 三级分类详情列表（包含该二级分类下的所有三级分类）
			
@@ -625,17 +612,8 @@ class DocumentProcessor:
 
				                                 'original_content': content[:100] + '...' if len(content) > 100 else content
			
 
				                             })
			
 
				                 else:
			
 
				-                    # DOCX基础处理
			
 
				-                    all_chunks = raw_content.get('chunks', [])
			
 
				+                    # 基础处理结果为空
			
 
				                     chunks = []
			
 
				-                    for chunk in all_chunks:
			
 
				-                        content = chunk.get('content', '').strip()
			
 
				-                        if content:
			
 
				-                            chunks.append({
			
 
				-                                'chunk_id': chunk.get('chunk_id', f'chunk_{len(chunks)+1}'),
			
 
				-                                'content': content,
			
 
				-                                'metadata': chunk.get('metadata', {})
			
 
				-                            })
			
 
				 
			
 
				             # 构建返回结果
			
 
				             result = {
			
--- a/core/construction_review/component/reviewers/completeness_reviewer.py
+++ b/core/construction_review/component/reviewers/completeness_reviewer.py
@@ -27,6 +27,9 @@ class TertiaryItem:
 
				     second_cn: str
			
 
				     third_cn: str
			
 
				     third_focus: str
			
 
				+    first_seq: int = 0
			
 
				+    second_seq: int = 0
			
 
				+    third_seq: int = 0
			
 
				 
			
 
				 
			
 
				 @dataclass
			
@@ -36,6 +39,8 @@ class SecondaryItem:
 
				     second_code: str
			
 
				     first_cn: str
			
 
				     second_cn: str
			
 
				+    first_seq: int = 0
			
 
				+    second_seq: int = 0
			
 
				 
			
 
				 
			
 
				 @dataclass
			
@@ -100,6 +105,20 @@ class TertiarySpecLoader:
 
				                 third_cn = str(row.get('third_name', '')).strip()
			
 
				                 third_focus = str(row.get('third_focus', '')).strip()
			
 
				 
			
 
				+                # 读取序号字段
			
 
				+                try:
			
 
				+                    first_seq = int(row.get('first_seq', 0) or 0)
			
 
				+                except (ValueError, TypeError):
			
 
				+                    first_seq = 0
			
 
				+                try:
			
 
				+                    second_seq = int(row.get('second_seq', 0) or 0)
			
 
				+                except (ValueError, TypeError):
			
 
				+                    second_seq = 0
			
 
				+                try:
			
 
				+                    third_seq = int(row.get('third_seq', 0) or 0)
			
 
				+                except (ValueError, TypeError):
			
 
				+                    third_seq = 0
			
 
				+
			
 
				                 # 动态构建一级分类名称映射
			
 
				                 if first_code and first_cn and first_code not in self.first_names:
			
 
				                     self.first_names[first_code] = first_cn
			
@@ -113,7 +132,10 @@ class TertiarySpecLoader:
 
				                     first_cn=first_cn or self.first_names.get(first_code, first_code),
			
 
				                     second_cn=second_cn,
			
 
				                     third_cn=third_cn,
			
 
				-                    third_focus=third_focus
			
 
				+                    third_focus=third_focus,
			
 
				+                    first_seq=first_seq,
			
 
				+                    second_seq=second_seq,
			
 
				+                    third_seq=third_seq
			
 
				                 )
			
 
				 
			
 
				                 # 存储二级项
			
@@ -123,7 +145,9 @@ class TertiarySpecLoader:
 
				                         first_code=first_code,
			
 
				                         second_code=second_code,
			
 
				                         first_cn=first_cn or self.first_names.get(first_code, first_code),
			
 
				-                        second_cn=second_cn
			
 
				+                        second_cn=second_cn,
			
 
				+                        first_seq=first_seq,
			
 
				+                        second_seq=second_seq
			
 
				                     )
			
 
				         
			
 
				         except Exception as e:
			
@@ -383,13 +407,19 @@ class LightweightCompletenessChecker:
 
				         extra_second = actual_second_keys - required_second
			
 
				 
			
 
				         # 一级缺失详情
			
 
				-        missing_first_details = [
			
 
				-            {
			
 
				+        missing_first_details = []
			
 
				+        for c in sorted(missing_first):
			
 
				+            # 从任意该一级下的二级获取 first_seq
			
 
				+            first_seq = 0
			
 
				+            for (fc, sc), item in self.secondary_specs.items():
			
 
				+                if fc == c:
			
 
				+                    first_seq = item.first_seq
			
 
				+                    break
			
 
				+            missing_first_details.append({
			
 
				                 "first_code": c,
			
 
				-                "first_name": self.spec_loader.first_names.get(c, c)
			
 
				-            }
			
 
				-            for c in sorted(missing_first)
			
 
				-        ]
			
 
				+                "first_name": self.spec_loader.first_names.get(c, c),
			
 
				+                "first_seq": first_seq
			
 
				+            })
			
 
				 
			
 
				         # 二级缺失详情
			
 
				         missing_second_details = []
			
@@ -398,8 +428,10 @@ class LightweightCompletenessChecker:
 
				             missing_second_details.append({
			
 
				                 "first_code": cat1,
			
 
				                 "first_name": item.first_cn if item else self.spec_loader.first_names.get(cat1, cat1),
			
 
				+                "first_seq": item.first_seq if item else 0,
			
 
				                 "secondary_code": cat2,
			
 
				-                "secondary_name": item.second_cn if item else "未知"
			
 
				+                "secondary_name": item.second_cn if item else "未知",
			
 
				+                "second_seq": item.second_seq if item else 0
			
 
				             })
			
 
				 
			
 
				         # 二级多余详情（目录有但标准无）
			
@@ -409,8 +441,10 @@ class LightweightCompletenessChecker:
 
				             extra_second_details.append({
			
 
				                 "first_code": cat1,
			
 
				                 "first_name": self.spec_loader.first_names.get(cat1, cat1),
			
 
				+                "first_seq": item.first_seq if item else 0,
			
 
				                 "secondary_code": cat2,
			
 
				                 "secondary_name": item.second_cn if item else "未知",
			
 
				+                "second_seq": item.second_seq if item else 0,
			
 
				                 "outline_title": outline_secondary.get((cat1, cat2), "")
			
 
				             })
			
 
				 
			
@@ -480,10 +514,13 @@ class LightweightCompletenessChecker:
 
				                 missing_details.append({
			
 
				                     "first_code": cat1,
			
 
				                     "first_name": item.first_cn,
			
 
				+                    "first_seq": item.first_seq,
			
 
				                     "secondary_code": cat2,
			
 
				                     "secondary_name": item.second_cn,
			
 
				+                    "second_seq": item.second_seq,
			
 
				                     "tertiary_code": cat3,
			
 
				                     "tertiary_name": item.third_cn,
			
 
				+                    "third_seq": item.third_seq,
			
 
				                     "focus": item.third_focus
			
 
				                 })
			
 
				         
			
@@ -508,8 +545,10 @@ class LightweightCompletenessChecker:
 
				             secondary_stats_list.append({
			
 
				                 "first_code": cat1,
			
 
				                 "first_name": item.first_cn if item else self.spec_loader.first_names.get(cat1, cat1),
			
 
				+                "first_seq": item.first_seq if item else 0,
			
 
				                 "secondary_code": cat2,
			
 
				                 "secondary_name": item.second_cn if item else "未知",
			
 
				+                "second_seq": item.second_seq if item else 0,
			
 
				                 "total_tertiary": stats["total"],
			
 
				                 "present": stats["present"],
			
 
				                 "missing": stats["missing"],
			
@@ -631,6 +670,12 @@ class LightweightCompletenessChecker:
 
				 
			
 
				         for first_code in sorted(required_first):
			
 
				             first_name = self.spec_loader.first_names.get(first_code, first_code)
			
 
				+            # 获取一级序号
			
 
				+            first_seq = 0
			
 
				+            for (fc, sc), item in self.secondary_specs.items():
			
 
				+                if fc == first_code:
			
 
				+                    first_seq = item.first_seq
			
 
				+                    break
			
 
				 
			
 
				             # ── 一级缺失 ──────────────────────────────────────────────
			
 
				             if first_code not in actual_first:
			
@@ -643,6 +688,7 @@ class LightweightCompletenessChecker:
 
				                         f"根据规范要求，文档必须包含'{first_name}'一级章节，"
			
 
				                         f"当前正文中未发现该章节任何内容"
			
 
				                     ),
			
 
				+                    "first_seq": first_seq,
			
 
				                 })
			
 
				                 continue
			
 
				 
			
@@ -653,6 +699,7 @@ class LightweightCompletenessChecker:
 
				             for (cat1, cat2) in required_second:
			
 
				                 sec_item = self.secondary_specs.get((cat1, cat2))
			
 
				                 second_name = sec_item.second_cn if sec_item else cat2
			
 
				+                second_seq = sec_item.second_seq if sec_item else 0
			
 
				 
			
 
				                 # ── 二级缺失 ──────────────────────────────────────────
			
 
				                 if (cat1, cat2) not in actual_secondary:
			
@@ -667,6 +714,8 @@ class LightweightCompletenessChecker:
 
				                             f"根据规范要求，'{first_name}'下应包含'{second_name}'二级章节，"
			
 
				                             f"当前正文中未发现该章节内容"
			
 
				                         ),
			
 
				+                        "first_seq": first_seq,
			
 
				+                        "second_seq": second_seq,
			
 
				                     })
			
 
				                     continue
			
 
				 
			
@@ -685,29 +734,20 @@ class LightweightCompletenessChecker:
 
				                 if not missing_t_items:
			
 
				                     continue
			
 
				 
			
 
				-                n = len(missing_t_items)
			
 
				-
			
 
				-                # 缺失名称列表（最多展示 5 条）
			
 
				-                missing_labels = [
			
 
				-                    f"{i + 1}.{t.third_cn}" for i, t in enumerate(missing_t_items[:5])
			
 
				-                ]
			
 
				-                if n > 5:
			
 
				-                    missing_labels.append(f"等共{n}项")
			
 
				-                missing_str = "、".join(missing_labels)
			
 
				-
			
 
				-                recommendations.append({
			
 
				-                    "level": "三级",
			
 
				-                    "issue_point": (
			
 
				-                        f"【三级内容缺失】{first_name} > {second_name} 缺少{n}个三级要点：{missing_str}"
			
 
				-                    ),
			
 
				-                    "location": f"{first_name} > {second_name}",
			
 
				-                    "suggestion": (
			
 
				-                        f"请补充'{second_name}'以下{n}个要点内容：{missing_str}"
			
 
				-                    ),
			
 
				-                    "reason": (
			
 
				-                        f"'{second_name}'下缺失以下{n}个规范要求的内容要点：{missing_str}"
			
 
				-                    ),
			
 
				-                })
			
 
				+                # 为每个缺失的三级项创建单独的 recommendation
			
 
				+                for t_item in missing_t_items:
			
 
				+                    recommendations.append({
			
 
				+                        "level": "三级",
			
 
				+                        "issue_point": (
			
 
				+                            f"【三级内容缺失】{first_name} > {second_name} > '{t_item.third_cn}'"
			
 
				+                        ),
			
 
				+                        "location": f"{first_name} > {second_name}",
			
 
				+                        "suggestion": f"请补充'{second_name}'下的'{t_item.third_cn}'内容",
			
 
				+                        "reason": f"'{second_name}'下缺失规范要求的'{t_item.third_cn}'内容要点",
			
 
				+                        "first_seq": first_seq,
			
 
				+                        "second_seq": second_seq,
			
 
				+                        "third_seq": t_item.third_seq,
			
 
				+                    })
			
 
				 
			
 
				         # ── 一致性审查：目录有列但正文无内容 ─────────────────────────────
			
 
				         if outline_result:
			
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/category_loaders.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/category_loaders.py
@@ -102,11 +102,14 @@ class CategoryStandardLoader:
 
				                 self.standards.append(CategoryStandard(
			
 
				                     first_code=row.get('first_code', ''),
			
 
				                     first_name=row.get('first_name', ''),
			
 
				+                    first_seq=int(row.get('first_seq', '0') or 0),
			
 
				                     second_code=row.get('second_code', ''),
			
 
				                     second_name=row.get('second_name', ''),
			
 
				+                    second_seq=int(row.get('second_seq', '0') or 0),
			
 
				                     second_focus=row.get('second_focus', ''),
			
 
				                     third_code=row.get('third_code', ''),
			
 
				                     third_name=row.get('third_name', ''),
			
 
				+                    third_seq=int(row.get('third_seq', '0') or 0),
			
 
				                     third_focus=row.get('third_focus', ''),
			
 
				                     keywords=row.get('keywords', '')
			
 
				                 ))
			
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/content_classifier.py
@@ -219,6 +219,7 @@ class ContentClassifierClient:
 
				             default_contents.append(ClassifiedContent(
			
 
				                 third_category_name=std.third_name,
			
 
				                 third_category_code=std.third_code,
			
 
				+                third_seq=std.third_seq,
			
 
				                 start_line=start_line,
			
 
				                 end_line=end_line,
			
 
				                 content=content
			
@@ -466,11 +467,11 @@ class ContentClassifierClient:
 
				             # 支持两种键名: classified_contents 或 classified_contents_list
			
 
				             items = data.get("classified_contents", []) or data.get("classified_contents_list", [])
			
 
				 
			
 
				-            # 构建索引映射表：索引 -> (third_name, third_code)
			
 
				-            index_mapping = {0: ("非标准项", "no_standard")}
			
 
				+            # 构建索引映射表：索引 -> (third_name, third_code, third_seq)
			
 
				+            index_mapping = {0: ("非标准项", "no_standard", 0)}
			
 
				             if section.category_standards:
			
 
				                 for i, std in enumerate(section.category_standards, 1):
			
 
				-                    index_mapping[i] = (std.third_name, std.third_code)
			
 
				+                    index_mapping[i] = (std.third_name, std.third_code, std.third_seq)
			
 
				 
			
 
				             for item in items:
			
 
				                 start_line = item.get("start_line", 0)
			
@@ -479,9 +480,9 @@ class ContentClassifierClient:
 
				                 # 优先使用 category_index 进行映射
			
 
				                 category_index = item.get("category_index")
			
 
				                 if category_index is not None:
			
 
				-                    # 通过索引映射获取标准名称和代码
			
 
				+                    # 通过索引映射获取标准名称、代码和序号
			
 
				                     idx = int(category_index) if isinstance(category_index, (int, float, str)) else 0
			
 
				-                    category_name, category_code = index_mapping.get(idx, ("非标准项", "no_standard"))
			
 
				+                    category_name, category_code, category_seq = index_mapping.get(idx, ("非标准项", "no_standard", 0))
			
 
				                 else:
			
 
				                     # 兼容旧格式：直接读取 third_category_code 和 third_category_name
			
 
				                     category_code = item.get("third_category_code", "")
			
@@ -503,6 +504,7 @@ class ContentClassifierClient:
 
				                 contents.append(ClassifiedContent(
			
 
				                     third_category_name=category_name,
			
 
				                     third_category_code=category_code,
			
 
				+                    third_seq=category_seq,
			
 
				                     start_line=start_line,
			
 
				                     end_line=end_line,
			
 
				                     content=content
			
@@ -552,6 +554,7 @@ class ContentClassifierClient:
 
				                     contents.append(ClassifiedContent(
			
 
				                         third_category_name=category_name,
			
 
				                         third_category_code=category_code,
			
 
				+                        third_seq=0,
			
 
				                         start_line=start_line,
			
 
				                         end_line=end_line,
			
 
				                         content=content
			
@@ -614,6 +617,7 @@ class ContentClassifierClient:
 
				                 merged_contents.append(ClassifiedContent(
			
 
				                     third_category_name=group_contents[0].third_category_name,
			
 
				                     third_category_code=category_code,
			
 
				+                    third_seq=group_contents[0].third_seq,
			
 
				                     start_line=range_info['start'],
			
 
				                     end_line=range_info['end'],
			
 
				                     content=merged_content
			
@@ -771,6 +775,7 @@ class ContentClassifierClient:
 
				                 supplemented.append(ClassifiedContent(
			
 
				                     third_category_name=std.third_name,
			
 
				                     third_category_code=std.third_code,
			
 
				+                    third_seq=std.third_seq,
			
 
				                     start_line=start,
			
 
				                     end_line=end,
			
 
				                     content=content
			
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/main_classifier.py
@@ -166,6 +166,7 @@ class LLMContentClassifier:
 
				                     {
			
 
				                         "third_category_name": c.third_category_name,
			
 
				                         "third_category_code": c.third_category_code,
			
 
				+                        "third_seq": c.third_seq,
			
 
				                         "start_line": c.start_line,
			
 
				                         "end_line": c.end_line,
			
 
				                         "content": c.content
			
--- a/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/models.py
+++ b/core/construction_review/component/reviewers/utils/llm_content_classifier_v2/models.py
@@ -13,11 +13,14 @@ class CategoryStandard:
 
				     """标准分类定义"""
			
 
				     first_code: str
			
 
				     first_name: str
			
 
				+    first_seq: int  # 一级序号
			
 
				     second_code: str
			
 
				     second_name: str
			
 
				+    second_seq: int  # 二级序号
			
 
				     second_focus: str  # 二级分类关注点
			
 
				     third_code: str
			
 
				     third_name: str
			
 
				+    third_seq: int  # 三级序号
			
 
				     third_focus: str
			
 
				     keywords: str = ""
			
 
				 
			
@@ -35,6 +38,7 @@ class ClassifiedContent:
 
				     """分类结果"""
			
 
				     third_category_name: str  # 三级分类名称
			
 
				     third_category_code: str  # 三级分类代码
			
 
				+    third_seq: int  # 三级序号
			
 
				     start_line: int
			
 
				     end_line: int
			
 
				     content: str  # 原文内容
			
--- a/requirements.txt
+++ b/requirements.txt
--- a/views/construction_review/file_upload.py
+++ b/views/construction_review/file_upload.py
@@ -6,7 +6,11 @@ import ast
 
				 import traceback
			
 
				 import uuid
			
 
				 import time
			
 
				+import tempfile
			
 
				+import subprocess
			
 
				+import os
			
 
				 from datetime import datetime
			
 
				+from pathlib import Path
			
 
				 
			
 
				 from pydantic import BaseModel, Field
			
 
				 from typing import Optional,List
			
@@ -20,6 +24,156 @@ from core.base.redis_duplicate_checker import RedisDuplicateChecker
 
				 from foundation.infrastructure.tracing import TraceContext, auto_trace
			
 
				 
			
 
				 
			
 
				+def _find_soffice_path() -> str:
			
 
				+    """
			
 
				+    查找 LibreOffice soffice 可执行文件路径
			
 
				+
			
 
				+    Returns:
			
 
				+        str: soffice 可执行文件路径
			
 
				+
			
 
				+    Raises:
			
 
				+        FileNotFoundError: 未找到 LibreOffice
			
 
				+    """
			
 
				+    import platform
			
 
				+
			
 
				+    # Linux/Docker 环境：直接使用 soffice
			
 
				+    if platform.system() != 'Windows':
			
 
				+        return 'soffice'
			
 
				+
			
 
				+    # Windows 环境：检测常见安装路径
			
 
				+    possible_paths = [
			
 
				+        r"C:\Program Files\LibreOffice\program\soffice.exe",
			
 
				+        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
			
 
				+    ]
			
 
				+
			
 
				+    for path in possible_paths:
			
 
				+        if os.path.exists(path):
			
 
				+            logger.info(f"找到 LibreOffice: {path}")
			
 
				+            return path
			
 
				+
			
 
				+    raise FileNotFoundError(
			
 
				+        "LibreOffice 未安装。请从 https://www.libreoffice.org/download/ 下载安装，"
			
 
				+        "或确保 soffice.exe 在 PATH 中"
			
 
				+    )
			
 
				+
			
 
				+
			
 
				def convert_docx_to_pdf(docx_content: bytes, filename: str) -> tuple[bytes, str]:
				    """
				    Convert the raw bytes of a docx/doc upload into a PDF.

				    On Windows the optional ``docx2pdf`` package is tried first; if it is
				    not importable or the conversion raises, the function falls back to
				    LibreOffice. On every other platform LibreOffice is used directly.

				    Args:
				        docx_content: Binary content of the docx/doc file.
				        filename: Original file name, used to derive the PDF file name.

				    Returns:
				        tuple[bytes, str]: (PDF bytes, PDF file name).

				    Raises:
				        Exception: Propagated from the LibreOffice converter on failure.
				    """
				    import platform

				    running_on_windows = platform.system() == 'Windows'
				    if not running_on_windows:
				        # Linux/Docker: LibreOffice is the only converter.
				        return _convert_via_libreoffice(docx_content, filename)

				    try:
				        from docx2pdf import convert
				        return _convert_via_docx2pdf(docx_content, filename, convert)
				    except ImportError:
				        logger.info("docx2pdf 未安装，使用 LibreOffice")
				    except Exception as word_error:
				        # Best-effort path: any docx2pdf failure degrades to LibreOffice.
				        logger.warning(f"docx2pdf 转换失败，回退到 LibreOffice: {str(word_error)}")

				    return _convert_via_libreoffice(docx_content, filename)
			
 
				+
			
 
				+
			
 
				def _convert_via_docx2pdf(docx_content: bytes, filename: str, convert_func) -> tuple[bytes, str]:
				    """
				    Convert a document to PDF through a caller-supplied converter.

				    The bytes are staged as ``input<ext>`` in a temporary directory,
				    ``convert_func(input_path, output_path)`` is invoked with string paths,
				    and the resulting ``output.pdf`` is read back. The temporary directory
				    is removed when the function exits.

				    Args:
				        docx_content: Binary content of the source document.
				        filename: Original file name; its suffix names the staged input and
				            its stem names the returned PDF.
				        convert_func: Callable taking (input_path, output_path).

				    Returns:
				        tuple[bytes, str]: (PDF bytes, ``<stem>.pdf``).

				    Raises:
				        Exception: When the converter leaves no output file behind.
				    """
				    with tempfile.TemporaryDirectory() as workdir:
				        source_name = Path(filename)
				        staged_input = Path(workdir, f"input{source_name.suffix.lower()}")
				        staged_output = Path(workdir, "output.pdf")
				        staged_input.write_bytes(docx_content)

				        logger.info(f"使用 Microsoft Word 转换 {filename} 为 PDF...")

				        convert_func(str(staged_input), str(staged_output))

				        if not staged_output.exists():
				            raise Exception("转换后未找到 PDF 文件")

				        pdf_filename = f"{source_name.stem}.pdf"
				        pdf_content = staged_output.read_bytes()

				        logger.info(f"成功转换 {filename} -> {pdf_filename}, PDF 大小: {len(pdf_content) / 1024:.2f} KB")

				        return pdf_content, pdf_filename
			
 
				+
			
 
				+
			
 
				def _convert_via_libreoffice(docx_content: bytes, filename: str, *, timeout: float = 120) -> tuple[bytes, str]:
				    """
				    Convert a document to PDF by running LibreOffice (soffice) headless.

				    The bytes are staged as ``input<ext>`` in a temporary directory and
				    soffice is run with ``--headless --convert-to pdf --outdir <tmp>``.
				    The temporary directory is removed when the function exits.

				    Args:
				        docx_content: Binary content of the source document.
				        filename: Original file name; its suffix names the staged input and
				            its stem names the returned PDF.
				        timeout: Seconds to wait for soffice before giving up (default 120).

				    Returns:
				        tuple[bytes, str]: (PDF bytes, ``<stem>.pdf``).

				    Raises:
				        Exception: soffice cannot be located or launched, times out, exits
				            non-zero, or produces no PDF.
				    """
				    with tempfile.TemporaryDirectory() as temp_dir:
				        temp_dir_path = Path(temp_dir)

				        # Stage the upload under a fixed ASCII name.
				        original_ext = Path(filename).suffix.lower()
				        base_name = Path(filename).stem
				        temp_input = temp_dir_path / f"input{original_ext}"
				        temp_input.write_bytes(docx_content)

				        logger.info(f"使用 LibreOffice 转换 {filename} 为 PDF...")

				        try:
				            soffice_path = _find_soffice_path()
				        except FileNotFoundError as e:
				            logger.error(str(e))
				            raise Exception(str(e)) from e

				        # Only the launch itself is guarded, so a FileNotFoundError raised
				        # later (e.g. while reading the result) is not misreported as
				        # "LibreOffice not installed".
				        try:
				            result = subprocess.run(
				                [
				                    soffice_path, '--headless', '--convert-to', 'pdf',
				                    '--outdir', str(temp_dir_path),
				                    str(temp_input)
				                ],
				                capture_output=True,
				                text=True,
				                timeout=timeout
				            )
				        except subprocess.TimeoutExpired as e:
				            raise Exception("LibreOffice 转换超时") from e
				        except FileNotFoundError as e:
				            raise Exception("LibreOffice 未安装或 soffice 命令不可用") from e

				        if result.returncode != 0:
				            logger.error(f"LibreOffice 转换失败: {result.stderr}")
				            raise Exception(f"LibreOffice 转换失败: {result.stderr}")

				        # Prefer the output named after the staged input; fall back to any
				        # PDF in the directory (sorted so the choice is deterministic).
				        pdf_file = temp_input.with_suffix(".pdf")
				        if not pdf_file.exists():
				            pdf_files = sorted(temp_dir_path.glob("*.pdf"))
				            if not pdf_files:
				                raise Exception("转换后未找到 PDF 文件")
				            pdf_file = pdf_files[0]

				        pdf_content = pdf_file.read_bytes()
				        pdf_filename = f"{base_name}.pdf"

				        logger.info(f"成功转换 {filename} -> {pdf_filename}, PDF 大小: {len(pdf_content) / 1024:.2f} KB")

				        return pdf_content, pdf_filename
			
 
				+
			
 
				+
			
 
				 file_upload_router = APIRouter(prefix="/sgsc", tags=["前端接口"])
			
 
				 uploaded_files = {}
			
 
				 # 初始化工作流管理器
			
@@ -153,10 +307,27 @@ async def file_upload(
 
				 
			
 
				         # 确定文件类型
			
 
				         file_extension = file[0].filename.split('.')[-1].lower() if '.' in file[0].filename else ''
			
 
				+        original_filename = file[0].filename  # 保存原始文件名
			
 
				+
			
 
				         if content.startswith(b'%PDF'):
			
 
				             file_type = 'pdf'
			
 
				         elif content.startswith(b'PK\x03\x04') and file_extension in ['docx', 'doc']:
			
 
				-            file_type = 'docx'
			
 
				+            # 检测到 docx/doc 文件，转换为 PDF
			
 
				+            logger.info(f"检测到 {file_extension} 文件，正在转换为 PDF...")
			
 
				+            try:
			
 
				+                pdf_content, pdf_filename = convert_docx_to_pdf(content, original_filename)
			
 
				+                # 更新文件内容和相关信息
			
 
				+                content = pdf_content
			
 
				+                original_filename = pdf_filename
			
 
				+                file_type = 'pdf'  # 标记为 PDF 类型，后续流程按 PDF 处理
			
 
				+                file_size = len(pdf_content)
			
 
				+                file_size_mb = round(file_size / (1024 * 1024), 2)
			
 
				+                # 重新生成 MD5（基于转换后的 PDF）
			
 
				+                file_id = md5.md5_id(content)
			
 
				+                logger.info(f"文件已转换为 PDF: {pdf_filename}, 大小: {file_size_mb} MB")
			
 
				+            except Exception as convert_error:
			
 
				+                logger.error(f"docx 转 PDF 失败: {str(convert_error)}")
			
 
				+                raise FileUploadErrors.internal_error(f"文档转换失败: {str(convert_error)}")
			
 
				         else:
			
 
				             file_type = 'unknown'
			
 
				 
			
@@ -172,7 +343,7 @@ async def file_upload(
 
				                 'user_id': user,
			
 
				                 'file_type': file_type,
			
 
				                 'callback_task_id': callback_task_id,
			
 
				-                "file_name": file[0].filename,
			
 
				+                "file_name": original_filename,  # 使用转换后的文件名（docx 转 PDF 后会更新）
			
 
				                 "file_size": file_size_mb,
			
 
				                 'updated_at': created_at
			
 
				             }