| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
"""Document classification and splitting library.

Supports table-of-contents (TOC) extraction, intelligent classification,
and text splitting for PDF and Word documents.

Main features:
    1. Extract the TOC structure of PDF/Word documents.
    2. Identify and validate the hierarchy of TOC entries.
    3. Classify first-level TOC entries by matching keywords found in
       their second-level entries.
    4. Split text by TOC level and character count.
    5. Save classification results in multiple formats.

Usage example:
    from doc_worker import DocumentClassifier

    # Create a classifier instance
    classifier = DocumentClassifier()

    # Process a document
    result = classifier.process_document(
        file_path="document.pdf",
        target_level=1,  # classify first-level TOC entries
        output_dir="./output"
    )
"""

__version__ = "2.0.0"
# NOTE(review): "Your Name" looks like a template placeholder -- confirm
# the intended author string before release.
__author__ = "Your Name"

# Re-export the public classes so callers can import them from one place.
# The import order is kept as-is because importing a module may have side
# effects.
#
# NOTE(review): the first import pulls DocumentClassifier from the
# ``doc_worker`` package itself. If this file is that package's
# ``__init__.py``, this is a circular self-import -- TODO confirm where
# DocumentClassifier is actually defined.
from core.construction_review.component.doc_worker import DocumentClassifier
from core.construction_review.component.doc_worker.toc.toc_extractor import (
    TOCExtractor,
)
from core.construction_review.component.doc_worker.chunking.text_splitter import (
    TextSplitter,
)
from core.construction_review.component.doc_worker.classification.hierarchy_classifier import (
    HierarchyClassifier,
)
from core.construction_review.component.doc_worker.classification.rule_based_classifier import (
    RuleBasedClassifier,
)
from core.construction_review.component.doc_worker.output.result_saver import (
    ResultSaver,
)

# Names exported by ``from <package> import *``.
__all__ = [
    "DocumentClassifier",
    "TOCExtractor",
    "TextSplitter",
    "HierarchyClassifier",
    "RuleBasedClassifier",
    "ResultSaver",
]
|