# Certificates/hk-bank/batch_to_pdf.py

#!/usr/bin/env python3
"""
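Batch-convert every *.md file under the hk-bank directory and its subdirectories to PDF.
- PlantUML diagrams are rendered online through a pandoc Lua filter
- Emoji are replaced with plain-text symbols before conversion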
"""
import os
import re
import subprocess
import shutil

# Root directory that is walked recursively for Markdown files; also the base
# used to shorten paths in progress messages.
BASE_DIR = "/Users/jiahong/Documents/Job/docs/Certificates/hk-bank"
# Lua filter handed to pandoc via --lua-filter (PlantUML rendering, per the
# module docstring).
LUA_FILTER = "/Users/jiahong/Documents/Job/docs/.vscode/plantuml-filter.lua"
# Shared pandoc argument prefix; the input file and "-o <pdf>" are appended
# per document by convert_md_to_pdf().
PANDOC_BASE = [
    "pandoc",
    "--pdf-engine=xelatex",
    # Font choices below are presumably CJK-capable fonts installed on the
    # author's machine -- TODO confirm they exist wherever this script runs.
    "-V", "mainfont=STSong",
    "-V", "monofont=STFangsong",
    "-V", "geometry:margin=2.5cm",
    "-V", "CJKmainfont=STSong",
    "--lua-filter", LUA_FILTER,
]

# Emoji that carry meaning in the documents, mapped to ASCII stand-ins.
EMOJI_MAP = {
    "✅": "[OK]",
    "❌": "[X]",
    "⬜": "[ ]",
    "⬛": "[#]",
    "📄": "[doc]",
    # WARNING SIGN (U+26A0) followed by the emoji variation selector (U+FE0F).
    "\u26a0\ufe0f": "[!]",
    "🔍": "[search]",
    "📜": "[scroll]",
    "📝": "[note]",
}

# VARIATION SELECTOR-16: optional suffix that requests emoji presentation.
_VS16 = "\ufe0f"

# Supplementary-plane characters that are dropped after the explicit mapping.
# Planes 2 and 3 (U+20000-U+3FFFF) are deliberately excluded: they hold CJK
# ideograph extensions, i.e. real Chinese text that must survive.
_NON_CJK_ASTRAL_RE = re.compile(
    r'[\U00010000-\U0001ffff\U00040000-\U0010ffff]'
)


def replace_emoji(text):
    """Return *text* with emoji replaced by plain-text symbols.

    Every emoji listed in ``EMOJI_MAP`` is replaced by its ASCII stand-in,
    whether or not it is followed by U+FE0F (so both the bare and the
    emoji-presentation form of a symbol are handled and no stray selector is
    left behind). Any remaining character outside the Basic Multilingual
    Plane is then removed, except supplementary CJK ideographs
    (U+20000-U+3FFFF), which are kept.

    Args:
        text: Markdown source as a string.

    Returns:
        The cleaned string; identical to *text* when nothing matched.
    """
    for emoji, replacement in EMOJI_MAP.items():
        bare = emoji.replace(_VS16, "")
        if not bare:
            # A key made only of selectors would turn into replace("", ...).
            continue
        # Replace the selector-suffixed form first so the selector is consumed.
        text = text.replace(bare + _VS16, replacement).replace(bare, replacement)
    return _NON_CJK_ASTRAL_RE.sub('', text)

def has_plantuml(content):
    """Return True when *content* contains a PlantUML marker.

    Two markers are recognised: the ``@startuml`` directive and a fenced
    code block opened with ```` ```plantuml ````.
    """
    markers = ('@startuml', '```plantuml')
    return any(marker in content for marker in markers)

def convert_md_to_pdf(md_path):
    """Convert one Markdown file to a PDF placed next to it.

    The file is read as UTF-8 and passed through ``replace_emoji``. If that
    changes the text, the cleaned text is written to a temporary
    ``<md_path>.tmp.md`` file which is given to pandoc instead of the
    original; the temporary file is always removed afterwards, even when
    pandoc cannot be started or the run is interrupted.

    Args:
        md_path: Path of the Markdown file to convert.

    Returns:
        True when pandoc exited with status 0 and the PDF exists, otherwise
        False. Failures (including pandoc not being launchable) are reported
        on stdout with the first 300 characters of the error text.
    """
    with open(md_path, 'r', encoding='utf-8') as f:
        content = f.read()

    pdf_path = os.path.splitext(md_path)[0] + ".pdf"
    rel_md = os.path.relpath(md_path, BASE_DIR)

    # Only go through a temporary copy when emoji cleaning changed something.
    cleaned = replace_emoji(content)
    tmp_md = md_path + ".tmp.md" if cleaned != content else None
    src = md_path if tmp_md is None else tmp_md

    print(f"  转换: {rel_md}")

    succeeded = False
    error_text = ""
    try:
        if tmp_md is not None:
            with open(tmp_md, 'w', encoding='utf-8') as f:
                f.write(cleaned)
        try:
            result = subprocess.run(
                PANDOC_BASE + [src, "-o", pdf_path],
                capture_output=True,
                text=True,
            )
        except OSError as exc:
            # pandoc missing or not executable: report it as a failed
            # conversion instead of aborting the whole batch.
            error_text = str(exc)
        else:
            # A PDF left over from an earlier run must not mask a failure,
            # so the exit status is checked as well as the file's existence.
            succeeded = result.returncode == 0 and os.path.exists(pdf_path)
            error_text = result.stderr
    finally:
        if tmp_md is not None and os.path.exists(tmp_md):
            os.remove(tmp_md)

    if succeeded:
        print(f"  ✓ 生成: {os.path.relpath(pdf_path, BASE_DIR)}")
        return True

    print(f"  ✗ 失败: {rel_md}")
    if error_text:
        print(f"    错误: {error_text[:300]}")
    return False

def main():
    """Convert every Markdown file under ``BASE_DIR`` and print a summary.

    Hidden directories (names starting with ``.``) are not descended into.
    Directories and files are visited in sorted order so runs are
    reproducible. Files ending in ``.tmp.md`` are skipped: that suffix is
    used for the scratch copies written during conversion, and one left
    behind by an interrupted run must not be converted as a document.
    """
    md_files = []
    for root, dirs, files in os.walk(BASE_DIR):
        # In-place assignment prunes (and orders) what os.walk descends into.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
        md_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith('.md') and not name.endswith('.tmp.md')
        )

    print(f"共发现 {len(md_files)} 个 Markdown 文件\n")
    ok = 0
    for md in md_files:
        if convert_md_to_pdf(md):
            ok += 1
    fail = len(md_files) - ok

    print(f"\n完成：{ok} 个成功，{fail} 个失败")

# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()