|
@@ -0,0 +1,222 @@
|
|
|
|
|
+"""
|
|
|
|
|
+将文件夹下所有MD文档中的HTML表格转换为Markdown表格格式
|
|
|
|
|
+在原文件上直接修改
|
|
|
|
|
+"""
|
|
|
|
|
+import re
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import List
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Regular expressions used for the HTML-table conversion.
# The ``\b`` after the tr/td/th tag names keeps them from matching longer tag
# names that merely start with the same letters (``<thead>``, ``<track>``).
HTML_TABLE_RE = re.compile(r"<table[^>]*>.*?</table>", re.DOTALL | re.IGNORECASE)
HTML_TR_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr>", re.DOTALL | re.IGNORECASE)
HTML_TD_RE = re.compile(r"<td\b[^>]*>(.*?)</td>", re.DOTALL | re.IGNORECASE)
HTML_TH_RE = re.compile(r"<th\b[^>]*>(.*?)</th>", re.DOTALL | re.IGNORECASE)
# Optional whitespace around ``=`` is valid HTML (``rowspan = "2"``).
HTML_ROWSPAN_RE = re.compile(r'rowspan\s*=\s*["\']?(\d+)["\']?', re.IGNORECASE)
HTML_COLSPAN_RE = re.compile(r'colspan\s*=\s*["\']?(\d+)["\']?', re.IGNORECASE)


def parse_html_table(table_html: str) -> List[List[str]]:
    """Parse an HTML table into a two-dimensional list (rows x columns).

    Markdown tables have no rowspan/colspan, so a spanning cell is expanded
    by repeating its text in every grid position it covers.

    Args:
        table_html: Markup of one table (anything containing ``<tr>`` rows).

    Returns:
        One list of cell texts per non-empty row. Inner tags are removed,
        ``<br>`` becomes a space, and runs of ASCII whitespace (including
        line breaks) are collapsed to a single space. Returns ``[]`` when no
        row with cells is found.
    """
    rows = []
    # Grid positions reserved by a rowspan of an earlier row: {(row, col): text}
    rowspan_map = {}
    current_row = 0

    for tr_match in HTML_TR_RE.finditer(table_html):
        tr_content = tr_match.group(1)
        row = []
        col_idx = 0

        # Collect <td> and <th> cells and restore their document order.
        cells = list(HTML_TD_RE.finditer(tr_content)) + list(HTML_TH_RE.finditer(tr_content))
        cells.sort(key=lambda m: m.start())

        for cell_match in cells:
            # Only the opening tag may carry span attributes; searching the
            # whole cell would misread cell text such as "colspan=2".
            open_tag = tr_content[cell_match.start():cell_match.start(1)]

            # <br> separates words visually, so keep a space in its place
            # before the remaining inner tags are dropped.
            text = re.sub(r'<br\s*/?>', ' ', cell_match.group(1), flags=re.IGNORECASE)
            text = re.sub(r'<[^>]+>', '', text)
            # A Markdown table row must stay on one line.
            cell_content = re.sub(r'[ \t\r\n\f\v]+', ' ', text).strip()

            rowspan_match = HTML_ROWSPAN_RE.search(open_tag)
            colspan_match = HTML_COLSPAN_RE.search(open_tag)

            rowspan = int(rowspan_match.group(1)) if rowspan_match else 1
            # A colspan of 0 must not make the cell disappear.
            colspan = max(1, int(colspan_match.group(1))) if colspan_match else 1

            # Fill positions already taken by a rowspan from a previous row.
            while (current_row, col_idx) in rowspan_map:
                row.append(rowspan_map[(current_row, col_idx)])
                col_idx += 1

            # Repeat the text once per spanned column.
            row.extend([cell_content] * colspan)

            # Reserve every spanned column (not just the first one) in each
            # of the following rows covered by the rowspan.
            for r in range(1, rowspan):
                for c in range(colspan):
                    rowspan_map[(current_row + r, col_idx + c)] = cell_content

            col_idx += colspan

        # Trailing positions of this row that are still covered by a rowspan.
        while (current_row, col_idx) in rowspan_map:
            row.append(rowspan_map[(current_row, col_idx)])
            col_idx += 1

        # Rows without any cell are ignored and do not advance the row index.
        if row:
            rows.append(row)
            current_row += 1

    return rows
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def convert_html_table_to_md(table_html: str) -> str:
    """Convert one HTML table into a Markdown (pipe) table.

    The first parsed row becomes the header row and is followed by a
    ``|---|`` delimiter row.

    Args:
        table_html: Markup of a single ``<table>...</table>`` element.

    Returns:
        The Markdown table as a newline-joined string, or ``table_html``
        unchanged when no rows could be parsed from it.
    """
    rows = parse_html_table(table_html)
    if not rows:
        return table_html

    # Header and delimiter rows must have the same number of cells, and body
    # cells beyond the header width are not rendered; pad every row to the
    # widest one so no content is lost.
    width = max(len(row) for row in rows)

    md_lines = []
    for i, row in enumerate(rows):
        padded = row + [''] * (width - len(row))
        # A line break inside a cell would split the table row; literal pipes
        # would be read as column separators.
        escaped_row = [
            ' '.join(cell.splitlines()).replace('|', '\\|') for cell in padded
        ]
        md_lines.append('| ' + ' | '.join(escaped_row) + ' |')

        # The delimiter row goes right after the first (header) row.
        if i == 0:
            md_lines.append('|' + '|'.join(['---'] * width) + '|')

    return '\n'.join(md_lines)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def convert_all_html_tables_to_md(text: str) -> str:
    """Return *text* with every HTML table replaced by a Markdown table.

    Each span matched by ``HTML_TABLE_RE`` is passed to
    ``convert_html_table_to_md`` and substituted with its result; all other
    text is left as it is.
    """
    return HTML_TABLE_RE.sub(
        lambda found: convert_html_table_to_md(found.group(0)),
        text,
    )
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def process_md_file(file_path: Path) -> bool:
    """Convert the HTML tables of a single Markdown file in place.

    Args:
        file_path: Path of the UTF-8 encoded ``.md`` file to process.

    Returns:
        ``True`` when the file was rewritten with converted tables. ``False``
        when it contains no HTML table, when conversion left the text
        unchanged, or when an error occurred (the error is printed, not
        raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # One scan yields both the "has tables" check and the reported count.
        table_count = len(HTML_TABLE_RE.findall(content))
        if table_count == 0:
            print(f"  [跳过] {file_path.name} (无HTML表格)")
            return False

        new_content = convert_all_html_tables_to_md(content)

        # Tables without parsable rows are returned unchanged by the
        # converter; do not rewrite the file or report a conversion then.
        if new_content == content:
            print(f"  [跳过] {file_path.name} (HTML表格未能转换)")
            return False

        # Write the result back to the original file.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_content)

        print(f"  [已转换] {file_path.name} ({table_count}个表格)")
        return True

    except Exception as e:
        # Best effort per file: report the problem and let the caller go on.
        print(f"  [错误] {file_path.name} - {str(e)}")
        return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def process_folder(folder_path: str | Path) -> dict:
    """Convert the HTML tables of every ``.md`` file below a folder.

    The folder is searched recursively and each file is modified in place.

    Args:
        folder_path: Directory to search.

    Returns:
        A dict with the counters ``"processed"``, ``"skipped"``, ``"failed"``
        and ``"total_tables"``.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
    """
    folder_path = Path(folder_path)
    if not folder_path.is_dir():
        raise NotADirectoryError(f"不是有效的文件夹: {folder_path}")

    results = {
        "processed": 0,
        "skipped": 0,
        "failed": 0,
        "total_tables": 0
    }

    print(f"\n[处理文件夹] {folder_path}")
    print("=" * 60)

    # Collect all Markdown files; a directory whose name ends in ".md" also
    # matches the pattern and must not be opened as a file.
    md_files = [p for p in folder_path.rglob("*.md") if p.is_file()]

    if not md_files:
        print("[警告] 未找到MD文件")
        return results

    print(f"[找到] {len(md_files)} 个MD文件\n")

    for md_file in md_files:
        try:
            # Count the tables before the file is rewritten.
            with open(md_file, 'r', encoding='utf-8') as f:
                content = f.read()
            table_count = len(HTML_TABLE_RE.findall(content))

            if process_md_file(md_file):
                results["processed"] += 1
                results["total_tables"] += table_count
            elif table_count > 0:
                # process_md_file reports its own errors and returns False
                # instead of raising, so a False result for a file that does
                # contain tables means the conversion did not succeed.
                results["failed"] += 1
            else:
                results["skipped"] += 1

        except Exception as e:
            # Same ASCII-bracket tag as the other messages: an emoji here can
            # raise UnicodeEncodeError inside this handler when stdout uses a
            # legacy code page (e.g. redirected output on a GBK system).
            print(f"  [错误] {md_file.name} - {str(e)}")
            results["failed"] += 1

    print("\n" + "=" * 60)
    print("[处理结果]")
    print(f"  [已转换] {results['processed']} 个文件")
    print(f"  [跳过] {results['skipped']} 个文件")
    print(f"  [失败] {results['failed']} 个文件")
    print(f"  [共转换] {results['total_tables']} 个表格")
    print("=" * 60)

    return results
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def main():
    """Script entry point.

    Uses the first command-line argument as the folder to process; without
    an argument it falls back to the default path hard-coded below. Any
    exception raised while processing is printed instead of propagated.
    """
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        # Folder given on the command line.
        folder_path = cli_args[0]
    else:
        # Default folder; edit this to the folder you want to process.
        folder_path = r"F:\第二阶段编制依据及施工方案数据治理-20260206\最终编制依据"

    try:
        process_folder(folder_path)
    except Exception as err:
        print(f"\n[错误] {str(err)}")


if __name__ == "__main__":
    main()
|