Преглед изворни кода

feat:网页table转md表格

ai02 пре 4 недеља
родитељ
комит
e86da1f35c
1 измењених фајлова са 222 додато и 0 уклоњено
  1. 222 0
      src/app/scripts/html_tables_to_md.py

+ 222 - 0
src/app/scripts/html_tables_to_md.py

@@ -0,0 +1,222 @@
+"""
+将文件夹下所有MD文档中的HTML表格转换为Markdown表格格式
+在原文件上直接修改
+"""
+import re
+from pathlib import Path
+from typing import List
+
+
+# Regular expressions used for the HTML-table-to-Markdown conversion.
+# ``\b`` after each tag name stops longer tag names that merely share a
+# prefix (e.g. <track>, <thead>) from being mistaken for <tr> / <th>.
+HTML_TABLE_RE = re.compile(r"<table\b[^>]*>.*?</table\s*>", re.DOTALL | re.IGNORECASE)
+HTML_TR_RE = re.compile(r"<tr\b[^>]*>(.*?)</tr\s*>", re.DOTALL | re.IGNORECASE)
+HTML_TD_RE = re.compile(r"<td\b[^>]*>(.*?)</td\s*>", re.DOTALL | re.IGNORECASE)
+HTML_TH_RE = re.compile(r"<th\b[^>]*>(.*?)</th\s*>", re.DOTALL | re.IGNORECASE)
+# Group 1 is the numeric span. The look-behind rejects attributes that only
+# end in the name (e.g. ``data-rowspan``); whitespace around ``=`` is allowed.
+HTML_ROWSPAN_RE = re.compile(r'(?<![\w-])rowspan\s*=\s*["\']?(\d+)["\']?', re.IGNORECASE)
+HTML_COLSPAN_RE = re.compile(r'(?<![\w-])colspan\s*=\s*["\']?(\d+)["\']?', re.IGNORECASE)
+
+
+def parse_html_table(table_html: str) -> List[List[str]]:
+    """
+    Parse an HTML table into a two-dimensional list (rows x columns).
+
+    Markdown tables have no rowspan/colspan, so a spanning cell is expanded
+    by repeating its text in every grid position it covers.
+
+    Args:
+        table_html: HTML text that contains the ``<tr>`` rows to parse.
+
+    Returns:
+        One list of cell strings per ``<tr>`` that produced at least one
+        cell. Tags inside a cell are removed and the text is stripped.
+        Rows are not padded, so they may differ in length.
+    """
+    rows = []
+    # (row, col) -> text for grid positions reserved by a rowspan cell
+    # that started in an earlier row.
+    rowspan_map = {}
+    current_row = 0
+
+    for tr_match in HTML_TR_RE.finditer(table_html):
+        tr_content = tr_match.group(1)
+        row = []
+        col_idx = 0
+
+        # Collect <td> and <th> cells together, restoring document order.
+        cells = list(HTML_TD_RE.finditer(tr_content)) + list(HTML_TH_RE.finditer(tr_content))
+        cells.sort(key=lambda m: m.start())
+
+        for cell_match in cells:
+            cell_html = cell_match.group(0)
+            # Attributes live in the opening tag only; searching the whole
+            # cell could pick up "rowspan=..." from the cell's own content.
+            opening_tag = cell_html[:cell_html.find('>') + 1]
+            # Drop any nested HTML tags, keeping just the text.
+            cell_content = re.sub(r'<[^>]+>', '', cell_match.group(1)).strip()
+
+            rowspan_match = HTML_ROWSPAN_RE.search(opening_tag)
+            colspan_match = HTML_COLSPAN_RE.search(opening_tag)
+
+            # A span of 0 is clamped to 1 so the cell is never dropped
+            # (simplification: rowspan="0" is not extended to the table end).
+            rowspan = max(1, int(rowspan_match.group(1))) if rowspan_match else 1
+            colspan = max(1, int(colspan_match.group(1))) if colspan_match else 1
+
+            # Fill positions already taken by rowspans from previous rows.
+            while (current_row, col_idx) in rowspan_map:
+                row.append(rowspan_map[(current_row, col_idx)])
+                col_idx += 1
+
+            # Repeat the text once per spanned column.
+            row.extend([cell_content] * colspan)
+
+            # Reserve the same columns in the following rows. Every spanned
+            # column is recorded, so a cell that has both rowspan and
+            # colspan keeps later rows aligned.
+            for r in range(1, rowspan):
+                for c in range(colspan):
+                    rowspan_map[(current_row + r, col_idx + c)] = cell_content
+
+            col_idx += colspan
+
+        # Fill trailing positions of this row that are reserved by rowspans.
+        while (current_row, col_idx) in rowspan_map:
+            row.append(rowspan_map[(current_row, col_idx)])
+            col_idx += 1
+
+        # A <tr> without cells yields no row and does not advance the index.
+        if row:
+            rows.append(row)
+            current_row += 1
+
+    return rows
+
+
+def convert_html_table_to_md(table_html: str) -> str:
+    """
+    Convert one HTML table to a Markdown table.
+
+    Args:
+        table_html: HTML source of a single table.
+
+    Returns:
+        The Markdown table, with the first parsed row used as the header.
+        If ``parse_html_table`` finds no rows, ``table_html`` is returned
+        unchanged.
+    """
+    rows = parse_html_table(table_html)
+    if not rows:
+        return table_html
+
+    # Every row must have the same number of cells as the separator line,
+    # so short rows are padded with empty cells up to the widest row.
+    width = max(len(row) for row in rows)
+
+    def render_cell(cell: str) -> str:
+        # A Markdown table row must stay on one line: join the cell's lines
+        # with single spaces, then escape pipes so they do not split the cell.
+        parts = (part.strip() for part in cell.splitlines())
+        return ' '.join(part for part in parts if part).replace('|', '\\|')
+
+    md_lines = []
+
+    for i, row in enumerate(rows):
+        padded_row = row + [''] * (width - len(row))
+        md_lines.append('| ' + ' | '.join(render_cell(cell) for cell in padded_row) + ' |')
+
+        # The separator line goes right after the header (first) row.
+        if i == 0:
+            md_lines.append('|' + '|'.join(['---'] * width) + '|')
+
+    return '\n'.join(md_lines)
+
+
+def convert_all_html_tables_to_md(text: str) -> str:
+    """
+    Replace every HTML table found in ``text`` with its Markdown version.
+
+    Each match of ``HTML_TABLE_RE`` is handed to ``convert_html_table_to_md``
+    and substituted by the string it returns; text outside the matches is
+    left as it is.
+    """
+    return HTML_TABLE_RE.sub(
+        lambda found: convert_html_table_to_md(found.group(0)),
+        text,
+    )
+
+
+def process_md_file(file_path: Path) -> bool:
+    """
+    Convert the HTML tables of a single Markdown file in place.
+
+    The file is read as UTF-8. If it contains no HTML table it is left
+    untouched; otherwise the converted text is written back to the same
+    path. A status line is printed in every case.
+
+    Returns:
+        True when the file was rewritten, False when it was skipped or an
+        exception occurred (the exception is printed, not re-raised).
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as src:
+            original_text = src.read()
+
+        # One scan serves both the "anything to do?" check and the count
+        # reported in the status line.
+        found_tables = HTML_TABLE_RE.findall(original_text)
+        if not found_tables:
+            print(f"  [跳过] {file_path.name} (无HTML表格)")
+            return False
+
+        converted_text = convert_all_html_tables_to_md(original_text)
+
+        # Overwrite the original file with the converted text.
+        with open(file_path, 'w', encoding='utf-8') as dst:
+            dst.write(converted_text)
+
+        print(f"  [已转换] {file_path.name} ({len(found_tables)}个表格)")
+        return True
+
+    except Exception as e:
+        print(f"  [错误] {file_path.name} - {str(e)}")
+        return False
+
+
+def process_folder(folder_path: str | Path) -> dict:
+    """
+    Convert the HTML tables of every ``*.md`` file under a folder.
+
+    The folder is searched recursively. Progress and a final summary are
+    printed.
+
+    Returns:
+        A dict with the counters ``processed``, ``skipped``, ``failed`` and
+        ``total_tables``.
+
+    Raises:
+        NotADirectoryError: if ``folder_path`` is not an existing directory.
+    """
+    root = Path(folder_path)
+    if not root.is_dir():
+        raise NotADirectoryError(f"不是有效的文件夹: {root}")
+
+    results = dict.fromkeys(("processed", "skipped", "failed", "total_tables"), 0)
+    divider = "=" * 60
+
+    print(f"\n[处理文件夹] {root}")
+    print(divider)
+
+    md_files = list(root.rglob("*.md"))
+    if not md_files:
+        print("[警告] 未找到MD文件")
+        return results
+
+    print(f"[找到] {len(md_files)} 个MD文件\n")
+
+    for md_file in md_files:
+        try:
+            # Count the tables before the file is rewritten.
+            table_count = len(HTML_TABLE_RE.findall(md_file.read_text(encoding='utf-8')))
+            outcome = "processed" if process_md_file(md_file) else "skipped"
+        except Exception as e:
+            print(f"  ❌ 错误: {md_file.name} - {str(e)}")
+            results["failed"] += 1
+            continue
+
+        results[outcome] += 1
+        if outcome == "processed":
+            results["total_tables"] += table_count
+
+    print("\n" + divider)
+    print("[处理结果]")
+    print(f"  [已转换] {results['processed']} 个文件")
+    print(f"  [跳过] {results['skipped']} 个文件")
+    print(f"  [失败] {results['failed']} 个文件")
+    print(f"  [共转换] {results['total_tables']} 个表格")
+    print(divider)
+
+    return results
+
+
+def main():
+    """
+    Script entry point.
+
+    The folder to process is the first command-line argument; without one,
+    the hard-coded default path below is used. Any exception raised while
+    processing is printed instead of propagating.
+    """
+    import sys
+
+    # Fallback folder used when no argument is given; edit it to point at
+    # the folder that should be processed.
+    default_folder = r"F:\第二阶段编制依据及施工方案数据治理-20260206\最终编制依据"
+    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder
+
+    try:
+        process_folder(folder_path)
+    except Exception as e:
+        print(f"\n[错误] {str(e)}")
+
+
+if __name__ == "__main__":
+    main()