pdf_split_handle.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
# coding=utf-8
"""
@project: maxkb
@Author:虎
@file: pdf_split_handle.py
@date:2024/3/27 18:19
@desc: Split PDF files into titled paragraphs using bookmarks, internal links or font-size heuristics.
"""
import os
import re
import tempfile
import time
import traceback
from collections import Counter
from typing import List

import fitz
from django.utils.translation import gettext_lazy as _

from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
from common.utils.split_model import SplitModel, smart_split_paragraph
  20. default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
  21. re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
  22. re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
  23. re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
  24. re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
  25. re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*"),
  26. re.compile("(?<!\n)\n\n+")]
  27. def check_links_in_pdf(doc):
  28. for page_number in range(len(doc)):
  29. page = doc[page_number]
  30. links = page.get_links()
  31. if links:
  32. for link in links:
  33. if link['kind'] == 1:
  34. return True
  35. return False
  36. class PdfSplitHandle(BaseSplitHandle):
  37. def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
  38. with tempfile.NamedTemporaryFile(delete=False) as temp_file:
  39. # 将上传的文件保存到临时文件中
  40. for chunk in file.chunks():
  41. temp_file.write(chunk)
  42. # 获取临时文件的路径
  43. temp_file_path = temp_file.name
  44. pdf_document = fitz.open(temp_file_path)
  45. try:
  46. if type(limit) is str:
  47. limit = int(limit)
  48. if type(with_filter) is str:
  49. with_filter = with_filter.lower() == 'true'
  50. # 处理有目录的pdf
  51. result = self.handle_toc(pdf_document, limit)
  52. if result is not None:
  53. return {'name': file.name, 'content': result}
  54. # 没目录但是有链接的pdf
  55. result = self.handle_links(pdf_document, pattern_list, with_filter, limit)
  56. if result is not None and len(result) > 0:
  57. return {'name': file.name, 'content': result}
  58. # 没有目录的pdf
  59. content = self.handle_pdf_content(file, pdf_document)
  60. if pattern_list is not None and len(pattern_list) > 0:
  61. split_model = SplitModel(pattern_list, with_filter, limit)
  62. else:
  63. split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
  64. except BaseException as e:
  65. maxkb_logger.error(f"File: {file.name}, error: {e}, {traceback.format_exc()}")
  66. return {
  67. 'name': file.name,
  68. 'content': []
  69. }
  70. finally:
  71. pdf_document.close()
  72. # 处理完后可以删除临时文件
  73. os.remove(temp_file_path)
  74. return {
  75. 'name': file.name,
  76. 'content': split_model.parse(content)
  77. }
  78. @staticmethod
  79. def handle_pdf_content(file, pdf_document):
  80. # 第一步:收集所有字体大小
  81. font_sizes = []
  82. for page_num in range(len(pdf_document)):
  83. page = pdf_document.load_page(page_num)
  84. blocks = page.get_text("dict")["blocks"]
  85. for block in blocks:
  86. if block["type"] == 0:
  87. for line in block["lines"]:
  88. for span in line["spans"]:
  89. if span["size"] > 0:
  90. font_sizes.append(span["size"])
  91. # 计算正文字体大小(众数)
  92. if not font_sizes:
  93. body_font_size = 12
  94. else:
  95. from collections import Counter
  96. body_font_size = Counter(font_sizes).most_common(1)[0][0]
  97. # 第二步:提取内容
  98. content = ""
  99. for page_num in range(len(pdf_document)):
  100. start_time = time.time()
  101. page = pdf_document.load_page(page_num)
  102. blocks = page.get_text("dict")["blocks"]
  103. for block in blocks:
  104. if block["type"] == 0: # 文本块
  105. for line in block["lines"]:
  106. if not line["spans"]:
  107. continue
  108. text = "".join([span["text"] for span in line["spans"]])
  109. font_size = line["spans"][0]["size"]
  110. # 根据与正文字体的差值判断
  111. size_diff = font_size - body_font_size
  112. if size_diff > 2: # 明显大于正文
  113. content += f"## {text}\n\n"
  114. elif size_diff > 0.5: # 略大于正文
  115. content += f"### {text}\n\n"
  116. else: # 正文
  117. content += f"{text}\n"
  118. elif block["type"] == 1: # 图片块
  119. content += f"![image](image_{page_num}_{block['number']})\n\n"
  120. content = content.replace('\0', '')
  121. elapsed_time = time.time() - start_time
  122. maxkb_logger.debug(
  123. f"File: {file.name}, Page: {page_num + 1}, Time: {elapsed_time:.3f}s")
  124. return content
  125. @staticmethod
  126. def handle_toc(doc, limit):
  127. # 找到目录
  128. toc = doc.get_toc()
  129. if toc is None or len(toc) == 0:
  130. return None
  131. # 创建存储章节内容的数组
  132. chapters = []
  133. # 遍历目录并按章节提取文本
  134. for i, entry in enumerate(toc):
  135. level, title, start_page = entry
  136. start_page -= 1 # PyMuPDF 页码从 0 开始,书签页码从 1 开始
  137. chapter_title = title
  138. # 确定结束页码,如果是最后一个章节则到文档末尾
  139. if i + 1 < len(toc):
  140. end_page = toc[i + 1][2] - 1
  141. else:
  142. end_page = doc.page_count - 1
  143. # 去掉标题中的符号
  144. title = PdfSplitHandle.handle_chapter_title(title)
  145. # 提取该章节的文本内容
  146. chapter_text = ""
  147. for page_num in range(start_page, end_page + 1):
  148. page = doc.load_page(page_num) # 加载页面
  149. text = page.get_text("text")
  150. text = re.sub(r'(?<!。)\n+', '', text)
  151. text = re.sub(r'(?<!.)\n+', '', text)
  152. # print(f'title: {title}')
  153. idx = text.find(title)
  154. if idx > -1:
  155. text = text[idx + len(title):]
  156. if i + 1 < len(toc):
  157. l, next_title, next_start_page = toc[i + 1]
  158. next_title = PdfSplitHandle.handle_chapter_title(next_title)
  159. # print(f'next_title: {next_title}')
  160. idx = text.find(next_title)
  161. if idx > -1:
  162. text = text[:idx]
  163. chapter_text += text # 提取文本
  164. # Null characters are not allowed.
  165. chapter_text = chapter_text.replace('\0', '')
  166. # 限制标题长度
  167. real_chapter_title = chapter_title[:256]
  168. # 限制章节内容长度
  169. if 0 < limit < len(chapter_text):
  170. split_text = smart_split_paragraph(chapter_text, limit)
  171. for text in split_text:
  172. chapters.append({"title": real_chapter_title, "content": text})
  173. else:
  174. chapters.append(
  175. {"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
  176. # 保存章节内容和章节标题
  177. return chapters
  178. @staticmethod
  179. def handle_links(doc, pattern_list, with_filter, limit):
  180. # 检查文档是否包含内部链接
  181. if not check_links_in_pdf(doc):
  182. return
  183. # 创建存储章节内容的数组
  184. chapters = []
  185. toc_start_page = -1
  186. page_content = ""
  187. handle_pre_toc = True
  188. # 遍历 PDF 的每一页,查找带有目录链接的页
  189. for page_num in range(doc.page_count):
  190. page = doc.load_page(page_num)
  191. links = page.get_links()
  192. # 如果目录开始页码未设置,则设置为当前页码
  193. if len(links) > 0:
  194. toc_start_page = page_num
  195. if toc_start_page < 0:
  196. page_content += page.get_text('text')
  197. # 检查该页是否包含内部链接(即指向文档内部的页面)
  198. for num in range(len(links)):
  199. link = links[num]
  200. if link['kind'] == 1: # 'kind' 为 1 表示内部链接
  201. # 获取链接目标的页面
  202. dest_page = link['page']
  203. rect = link['from'] # 获取链接的矩形区域
  204. # 如果目录开始页码包括前言部分,则不处理前言部分
  205. if dest_page < toc_start_page:
  206. handle_pre_toc = False
  207. # 提取链接区域的文本作为标题
  208. link_title = page.get_text("text", clip=rect).strip().split("\n")[0].replace('.', '').strip()
  209. # print(f'link_title: {link_title}')
  210. # 提取目标页面内容作为章节开始
  211. start_page = dest_page
  212. end_page = dest_page
  213. # 下一个link
  214. next_link = links[num + 1] if num + 1 < len(links) else None
  215. next_link_title = None
  216. if next_link is not None and next_link['kind'] == 1:
  217. rect = next_link['from']
  218. next_link_title = page.get_text("text", clip=rect).strip() \
  219. .split("\n")[0].replace('.', '').strip()
  220. # print(f'next_link_title: {next_link_title}')
  221. end_page = next_link['page']
  222. # 提取章节内容
  223. chapter_text = ""
  224. for p_num in range(start_page, end_page + 1):
  225. p = doc.load_page(p_num)
  226. text = p.get_text("text")
  227. text = re.sub(r'(?<!。)\n+', '', text)
  228. text = re.sub(r'(?<!.)\n+', '', text)
  229. # print(f'\n{text}\n')
  230. idx = text.find(link_title)
  231. if idx > -1:
  232. text = text[idx + len(link_title):]
  233. if next_link_title is not None:
  234. idx = text.find(next_link_title)
  235. if idx > -1:
  236. text = text[:idx]
  237. chapter_text += text
  238. # Null characters are not allowed.
  239. chapter_text = chapter_text.replace('\0', '')
  240. # 限制章节内容长度
  241. if 0 < limit < len(chapter_text):
  242. split_text = smart_split_paragraph(chapter_text, limit)
  243. for text in split_text:
  244. chapters.append({"title": link_title, "content": text})
  245. else:
  246. # 保存章节信息
  247. chapters.append({"title": link_title, "content": chapter_text})
  248. # 目录中没有前言部分,手动处理
  249. if handle_pre_toc:
  250. pre_toc = []
  251. lines = page_content.strip().split('\n')
  252. try:
  253. for line in lines:
  254. if re.match(r'^前\s*言', line):
  255. pre_toc.append({'title': line, 'content': ''})
  256. else:
  257. pre_toc[-1]['content'] += line
  258. for i in range(len(pre_toc)):
  259. pre_toc[i]['content'] = re.sub(r'(?<!。)\n+', '', pre_toc[i]['content'])
  260. pre_toc[i]['content'] = re.sub(r'(?<!.)\n+', '', pre_toc[i]['content'])
  261. except BaseException as e:
  262. maxkb_logger.error(_('This document has no preface and is treated as ordinary text: {e}').format(e=e))
  263. if pattern_list is not None and len(pattern_list) > 0:
  264. split_model = SplitModel(pattern_list, with_filter, limit)
  265. else:
  266. split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
  267. # 插入目录前的部分
  268. page_content = re.sub(r'(?<!。)\n+', '', page_content)
  269. page_content = re.sub(r'(?<!.)\n+', '', page_content)
  270. page_content = page_content.strip()
  271. pre_toc = split_model.parse(page_content)
  272. chapters = pre_toc + chapters
  273. return chapters
  274. @staticmethod
  275. def handle_chapter_title(title):
  276. title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
  277. title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
  278. return title
  279. def support(self, file, get_buffer):
  280. file_name: str = file.name.lower()
  281. if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
  282. return True
  283. return False
  284. def get_content(self, file, save_image):
  285. with tempfile.NamedTemporaryFile(delete=False) as temp_file:
  286. # 将上传的文件保存到临时文件中
  287. temp_file.write(file.read())
  288. # 获取临时文件的路径
  289. temp_file_path = temp_file.name
  290. pdf_document = fitz.open(temp_file_path)
  291. try:
  292. return self.handle_pdf_content(file, pdf_document)
  293. except BaseException as e:
  294. traceback.print_exception(e)
  295. return f'{e}'