|
@@ -5,9 +5,11 @@
|
|
|
|
|
|
|
|
重构说明:
|
|
重构说明:
|
|
|
1. 使用类级别共享ChunkClassifier实例,避免重复创建LLM客户端
|
|
1. 使用类级别共享ChunkClassifier实例,避免重复创建LLM客户端
|
|
|
-2. 统一PDF/DOCX处理流程,消除代码重复
|
|
|
|
|
|
|
+2. 统一PDF处理流程,消除代码重复
|
|
|
3. 移除splits冗余数据,统一使用chunks
|
|
3. 移除splits冗余数据,统一使用chunks
|
|
|
4. 完善异常处理,记录完整堆栈信息
|
|
4. 完善异常处理,记录完整堆栈信息
|
|
|
|
|
+
|
|
|
|
|
+注意: DOCX/DOC 文件应在上传层转换为 PDF,本模块不再直接处理 DOCX
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
import io
|
|
import io
|
|
@@ -31,9 +33,6 @@ try:
|
|
|
from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
|
|
from .doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
|
|
|
from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
|
|
from .doc_worker.pdf_worker.text_splitter import PdfTextSplitter
|
|
|
from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
|
|
from .doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
|
|
|
- from .doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
|
|
|
|
|
- from .doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
|
|
|
|
|
- from .doc_worker.docx_worker.text_splitter import DocxTextSplitter
|
|
|
|
|
from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
|
|
from .doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
|
|
|
from .doc_worker.classification.chunk_classifier import ChunkClassifier
|
|
from .doc_worker.classification.chunk_classifier import ChunkClassifier
|
|
|
from .doc_worker.config.provider import default_config_provider
|
|
from .doc_worker.config.provider import default_config_provider
|
|
@@ -43,9 +42,6 @@ except ImportError:
|
|
|
from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
|
|
from core.construction_review.component.doc_worker.pdf_worker.hybrid_extractor import HybridFullTextExtractor
|
|
|
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
|
|
from core.construction_review.component.doc_worker.pdf_worker.text_splitter import PdfTextSplitter
|
|
|
from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
|
|
from core.construction_review.component.doc_worker.pdf_worker.classifier import PdfHierarchyClassifier
|
|
|
- from core.construction_review.component.doc_worker.docx_worker.toc_extractor import DocxTOCExtractor
|
|
|
|
|
- from core.construction_review.component.doc_worker.docx_worker.full_text_extractor import DocxFullTextExtractor
|
|
|
|
|
- from core.construction_review.component.doc_worker.docx_worker.text_splitter import DocxTextSplitter
|
|
|
|
|
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
|
|
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import HierarchyClassifier as DocxHierarchyClassifier
|
|
|
from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
|
|
from core.construction_review.component.doc_worker.classification.chunk_classifier import ChunkClassifier
|
|
|
from core.construction_review.component.doc_worker.config.provider import default_config_provider
|
|
from core.construction_review.component.doc_worker.config.provider import default_config_provider
|
|
@@ -158,7 +154,7 @@ class DocumentProcessor:
|
|
|
_shared_chunk_classifier: Optional[ChunkClassifier] = None
|
|
_shared_chunk_classifier: Optional[ChunkClassifier] = None
|
|
|
|
|
|
|
|
def __init__(self, progress_manager=None, callback_task_id: str = None, progress_state: dict = None):
|
|
def __init__(self, progress_manager=None, callback_task_id: str = None, progress_state: dict = None):
|
|
|
- self.supported_types = ['pdf', 'docx']
|
|
|
|
|
|
|
+ self.supported_types = ['pdf'] # DOCX/DOC 应在上传层转换为 PDF
|
|
|
self.config = default_config_provider
|
|
self.config = default_config_provider
|
|
|
# SSE 进度推送(由 DocumentWorkflow 注入)
|
|
# SSE 进度推送(由 DocumentWorkflow 注入)
|
|
|
self._progress_manager = progress_manager
|
|
self._progress_manager = progress_manager
|
|
@@ -166,24 +162,54 @@ class DocumentProcessor:
|
|
|
# 与心跳协程共享的状态字典,更新后心跳自动反映新阶段
|
|
# 与心跳协程共享的状态字典,更新后心跳自动反映新阶段
|
|
|
self._progress_state = progress_state
|
|
self._progress_state = progress_state
|
|
|
|
|
|
|
|
- # 初始化各类型文档的处理组件
|
|
|
|
|
|
|
+ # 初始化PDF文档的处理组件
|
|
|
self._components: Dict[str, DocumentComponents] = {
|
|
self._components: Dict[str, DocumentComponents] = {
|
|
|
'pdf': DocumentComponents(
|
|
'pdf': DocumentComponents(
|
|
|
toc_extractor=PdfTOCExtractor(),
|
|
toc_extractor=PdfTOCExtractor(),
|
|
|
classifier=PdfHierarchyClassifier(),
|
|
classifier=PdfHierarchyClassifier(),
|
|
|
fulltext_extractor=HybridFullTextExtractor(),
|
|
fulltext_extractor=HybridFullTextExtractor(),
|
|
|
text_splitter=PdfTextSplitter()
|
|
text_splitter=PdfTextSplitter()
|
|
|
- ),
|
|
|
|
|
- 'docx': DocumentComponents(
|
|
|
|
|
- toc_extractor=DocxTOCExtractor(),
|
|
|
|
|
- classifier=DocxHierarchyClassifier(),
|
|
|
|
|
- fulltext_extractor=DocxFullTextExtractor(
|
|
|
|
|
- paragraphs_per_page=int(self.config.get("toc_extraction.paragraphs_per_page", 30))
|
|
|
|
|
- ),
|
|
|
|
|
- text_splitter=DocxTextSplitter()
|
|
|
|
|
)
|
|
)
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ # 加载标准分类表并创建序号映射
|
|
|
|
|
+ self._load_category_seq_mappings()
|
|
|
|
|
+
|
|
|
|
|
+ def _load_category_seq_mappings(self):
|
|
|
|
|
+ """加载标准分类表CSV,创建code到seq的映射"""
|
|
|
|
|
+ self._first_seq_map: Dict[str, int] = {} # first_code -> first_seq
|
|
|
|
|
+ self._second_seq_map: Dict[str, int] = {} # second_code -> second_seq
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ import csv
|
|
|
|
|
+ csv_path = Path(__file__).parent / 'doc_worker' / 'config' / 'StandardCategoryTable.csv'
|
|
|
|
|
+ if not csv_path.exists():
|
|
|
|
|
+ logger.warning(f"标准分类表不存在: {csv_path}")
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
|
|
+ with open(csv_path, 'r', encoding='utf-8-sig') as f:
|
|
|
|
|
+ reader = csv.DictReader(f)
|
|
|
|
|
+ for row in reader:
|
|
|
|
|
+ first_code = row.get('first_code', '').strip()
|
|
|
|
|
+ second_code = row.get('second_code', '').strip()
|
|
|
|
|
+ try:
|
|
|
|
|
+ first_seq = int(row.get('first_seq', 0) or 0)
|
|
|
|
|
+ except (ValueError, TypeError):
|
|
|
|
|
+ first_seq = 0
|
|
|
|
|
+ try:
|
|
|
|
|
+ second_seq = int(row.get('second_seq', 0) or 0)
|
|
|
|
|
+ except (ValueError, TypeError):
|
|
|
|
|
+ second_seq = 0
|
|
|
|
|
+
|
|
|
|
|
+ if first_code and first_code not in self._first_seq_map:
|
|
|
|
|
+ self._first_seq_map[first_code] = first_seq
|
|
|
|
|
+ if second_code and second_code not in self._second_seq_map:
|
|
|
|
|
+ self._second_seq_map[second_code] = second_seq
|
|
|
|
|
+
|
|
|
|
|
+ logger.debug(f"加载分类序号映射: 一级 {len(self._first_seq_map)} 个, 二级 {len(self._second_seq_map)} 个")
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning(f"加载分类序号映射失败: {e}")
|
|
|
|
|
+
|
|
|
@classmethod
|
|
@classmethod
|
|
|
def _get_chunk_classifier(cls) -> ChunkClassifier:
|
|
def _get_chunk_classifier(cls) -> ChunkClassifier:
|
|
|
"""获取共享的ChunkClassifier实例"""
|
|
"""获取共享的ChunkClassifier实例"""
|
|
@@ -456,10 +482,6 @@ class DocumentProcessor:
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- # DOCX额外保留full_text字段
|
|
|
|
|
- if file_type == 'docx':
|
|
|
|
|
- result['full_text'] = ''.join([page.get('text', '') for page in pages_content])
|
|
|
|
|
-
|
|
|
|
|
return result
|
|
return result
|
|
|
|
|
|
|
|
async def _fallback_processing(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
|
|
async def _fallback_processing(self, file_content: bytes, file_type: str) -> Dict[str, Any]:
|
|
@@ -468,15 +490,12 @@ class DocumentProcessor:
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
file_content: 文件内容
|
|
file_content: 文件内容
|
|
|
- file_type: 文件类型(pdf/docx)
|
|
|
|
|
|
|
+ file_type: 文件类型(仅支持 pdf)
|
|
|
|
|
|
|
|
Returns:
|
|
Returns:
|
|
|
Dict: 基础处理结果
|
|
Dict: 基础处理结果
|
|
|
"""
|
|
"""
|
|
|
- if file_type == 'pdf':
|
|
|
|
|
- return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
- else:
|
|
|
|
|
- return await self._fallback_docx_processing(file_content)
|
|
|
|
|
|
|
+ return await self._fallback_pdf_processing(file_content)
|
|
|
|
|
|
|
|
async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
|
|
async def _fallback_pdf_processing(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
"""PDF基础处理模式(当智能处理失败时使用)"""
|
|
"""PDF基础处理模式(当智能处理失败时使用)"""
|
|
@@ -533,46 +552,6 @@ class DocumentProcessor:
|
|
|
logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
|
|
logger.error(f"基础PDF处理失败: {str(e)}", exc_info=True)
|
|
|
raise
|
|
raise
|
|
|
|
|
|
|
|
- async def _fallback_docx_processing(self, file_content: bytes) -> Dict[str, Any]:
|
|
|
|
|
- """DOCX基础处理模式(当智能处理失败时使用)"""
|
|
|
|
|
- try:
|
|
|
|
|
- from docx import Document
|
|
|
|
|
- from io import BytesIO
|
|
|
|
|
-
|
|
|
|
|
- logger.info("使用基础DOCX处理模式(内存模式)")
|
|
|
|
|
- doc = Document(BytesIO(file_content))
|
|
|
|
|
- full_text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
|
|
|
|
|
-
|
|
|
|
|
- # 简单分块,并过滤空内容
|
|
|
|
|
- chunks = []
|
|
|
|
|
- chunk_size = 1000
|
|
|
|
|
- chunk_index = 1
|
|
|
|
|
- for i in range(0, len(full_text), chunk_size):
|
|
|
|
|
- chunk_text = full_text[i:i+chunk_size].strip()
|
|
|
|
|
- if chunk_text:
|
|
|
|
|
- chunks.append({
|
|
|
|
|
- 'chunk_id': f'chunk_{chunk_index}',
|
|
|
|
|
- 'content': chunk_text,
|
|
|
|
|
- 'metadata': {'chunk_index': chunk_index}
|
|
|
|
|
- })
|
|
|
|
|
- chunk_index += 1
|
|
|
|
|
-
|
|
|
|
|
- logger.info(f"基础处理完成,有效分块数量: {len(chunks)}")
|
|
|
|
|
-
|
|
|
|
|
- return {
|
|
|
|
|
- 'document_type': 'docx',
|
|
|
|
|
- 'total_chunks': len(chunks),
|
|
|
|
|
- 'full_text': full_text,
|
|
|
|
|
- 'chunks': chunks,
|
|
|
|
|
- 'metadata': {
|
|
|
|
|
- 'paragraphs_count': len(doc.paragraphs),
|
|
|
|
|
- 'word_count': len(full_text.split())
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- except Exception as e:
|
|
|
|
|
- logger.error(f"基础DOCX处理失败: {str(e)}", exc_info=True)
|
|
|
|
|
- raise
|
|
|
|
|
-
|
|
|
|
|
def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
|
|
def structure_content(self, raw_content: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
"""结构化处理,适配doc_worker返回的格式"""
|
|
"""结构化处理,适配doc_worker返回的格式"""
|
|
|
try:
|
|
try:
|
|
@@ -589,6 +568,12 @@ class DocumentProcessor:
|
|
|
if content:
|
|
if content:
|
|
|
metadata = chunk.get('metadata', {})
|
|
metadata = chunk.get('metadata', {})
|
|
|
element_tag = metadata.get('element_tag', {})
|
|
element_tag = metadata.get('element_tag', {})
|
|
|
|
|
+ chapter_classification = metadata.get('chapter_classification', '')
|
|
|
|
|
+ secondary_category_code = metadata.get('secondary_category_code', '')
|
|
|
|
|
+
|
|
|
|
|
+ # 获取序号
|
|
|
|
|
+ first_seq = self._first_seq_map.get(chapter_classification, 0)
|
|
|
|
|
+ second_seq = self._second_seq_map.get(secondary_category_code, 0)
|
|
|
|
|
|
|
|
chunks.append({
|
|
chunks.append({
|
|
|
'chunk_id': metadata.get('chunk_id', ''),
|
|
'chunk_id': metadata.get('chunk_id', ''),
|
|
@@ -596,9 +581,11 @@ class DocumentProcessor:
|
|
|
'content': content,
|
|
'content': content,
|
|
|
'section_label': metadata.get('section_label', ''),
|
|
'section_label': metadata.get('section_label', ''),
|
|
|
'project_plan_type': metadata.get('project_plan_type', ''),
|
|
'project_plan_type': metadata.get('project_plan_type', ''),
|
|
|
- 'chapter_classification': metadata.get('chapter_classification', ''),
|
|
|
|
|
|
|
+ 'chapter_classification': chapter_classification,
|
|
|
|
|
+ 'first_seq': first_seq,
|
|
|
'secondary_category_cn': metadata.get('secondary_category_cn', ''),
|
|
'secondary_category_cn': metadata.get('secondary_category_cn', ''),
|
|
|
- 'secondary_category_code': metadata.get('secondary_category_code', ''),
|
|
|
|
|
|
|
+ 'secondary_category_code': secondary_category_code,
|
|
|
|
|
+ 'second_seq': second_seq,
|
|
|
'tertiary_category_cn': metadata.get('tertiary_category_cn', ''),
|
|
'tertiary_category_cn': metadata.get('tertiary_category_cn', ''),
|
|
|
'tertiary_category_code': metadata.get('tertiary_category_code', ''),
|
|
'tertiary_category_code': metadata.get('tertiary_category_code', ''),
|
|
|
# 三级分类详情列表(包含该二级分类下的所有三级分类)
|
|
# 三级分类详情列表(包含该二级分类下的所有三级分类)
|
|
@@ -625,17 +612,8 @@ class DocumentProcessor:
|
|
|
'original_content': content[:100] + '...' if len(content) > 100 else content
|
|
'original_content': content[:100] + '...' if len(content) > 100 else content
|
|
|
})
|
|
})
|
|
|
else:
|
|
else:
|
|
|
- # DOCX基础处理
|
|
|
|
|
- all_chunks = raw_content.get('chunks', [])
|
|
|
|
|
|
|
+ # 基础处理结果为空
|
|
|
chunks = []
|
|
chunks = []
|
|
|
- for chunk in all_chunks:
|
|
|
|
|
- content = chunk.get('content', '').strip()
|
|
|
|
|
- if content:
|
|
|
|
|
- chunks.append({
|
|
|
|
|
- 'chunk_id': chunk.get('chunk_id', f'chunk_{len(chunks)+1}'),
|
|
|
|
|
- 'content': content,
|
|
|
|
|
- 'metadata': chunk.get('metadata', {})
|
|
|
|
|
- })
|
|
|
|
|
|
|
|
|
|
# 构建返回结果
|
|
# 构建返回结果
|
|
|
result = {
|
|
result = {
|