# system-design/software-copyright/batch_md_to_pdf.py

#!/usr/bin/env python3
"""
批量将 Markdown 转为 PDF —— 通过 Google Chrome Headless 浏览器渲染
1. Markdown → HTML（含代码高亮、表格样式）
2. HTML → 写入临时文件 → Chrome --headless --print-to-pdf
"""
import glob
import os
import subprocess
import tempfile
import markdown
from markdown.extensions.tables import TableExtension
from markdown.extensions.fenced_code import FencedCodeExtension
from markdown.extensions.codehilite import CodeHiliteExtension
from markdown.extensions.toc import TocExtension

# ─── Configuration ───
# Directory that contains this script; target folders are searched beneath it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
# Glob patterns for the numbered folders "01-*" through "13-*".
DIRS = ["%02d-*" % number for number in range(1, 14)]
# File-name globs of the Markdown documents to convert
# ("-鉴别材料" = identification materials, "-源程序" = source program).
PATTERNS = ["*-鉴别材料.md", "*-源程序.md"]
# Absolute path of the Google Chrome executable (macOS application bundle).
CHROME = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

# Stylesheet embedded verbatim in the <style> element of every generated HTML
# page. It sets the A4 page size and margins, a CJK-capable font stack, and the
# look of headings, tables, inline/block code, block quotes, images and rules.
# The text is a runtime string: CSS comments added inside it would end up in
# the generated pages.
CSS = """
@page {
    size: A4;
    margin: 20mm 18mm 20mm 18mm;
}
body {
    font-family: "PingFang SC", "Microsoft YaHei", "STSong", "Noto Sans CJK SC", sans-serif;
    font-size: 11pt;
    line-height: 1.7;
    color: #222;
    max-width: 100%;
    padding: 0 10px;
    margin: 0;
}
h1 { font-size: 20pt; border-bottom: 2px solid #333; padding-bottom: 6px; margin-top: 24pt; }
h2 { font-size: 16pt; border-bottom: 1px solid #999; padding-bottom: 4px; margin-top: 20pt; }
h3 { font-size: 13pt; margin-top: 16pt; }
h4 { font-size: 12pt; margin-top: 12pt; }
table {
    border-collapse: collapse;
    width: 100%;
    margin: 12px 0;
    font-size: 10pt;
    page-break-inside: auto;
}
th, td {
    border: 1px solid #bbb;
    padding: 6px 10px;
    text-align: left;
}
th { background-color: #f0f0f0; font-weight: bold; }
tr:nth-child(even) { background-color: #fafafa; }
code {
    font-family: "SF Mono", "Menlo", "Consolas", "Monaco", monospace;
    font-size: 9.5pt;
    background: #f5f5f5;
    padding: 1px 4px;
    border-radius: 3px;
}
pre {
    background: #f6f8fa;
    border: 1px solid #e1e4e8;
    border-radius: 6px;
    padding: 12px 16px;
    overflow-x: auto;
    font-size: 8.5pt;
    line-height: 1.5;
    page-break-inside: auto;
    white-space: pre-wrap;
    word-wrap: break-word;
}
pre code {
    background: transparent;
    padding: 0;
    font-size: inherit;
}
blockquote {
    border-left: 4px solid #dfe2e5;
    padding: 4px 16px;
    margin: 12px 0;
    color: #555;
    background: #f9f9f9;
}
img { max-width: 100%; }
hr { border: none; border-top: 1px solid #ddd; margin: 20px 0; }
.codehilite { background: #f6f8fa; border-radius: 6px; padding: 12px 16px; }
"""

def md_to_html(md_path):
    """Convert a Markdown file into a complete, standalone HTML page.

    The Markdown is rendered with the tables, fenced-code, codehilite and toc
    extensions, then wrapped in an HTML5 document that embeds the module-level
    ``CSS`` stylesheet.

    Args:
        md_path: Path of the Markdown file (UTF-8, with or without a BOM).

    Returns:
        The full HTML document as a string. Its ``<title>`` is the file name
        without extension, HTML-escaped.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from html import escape

    # 'utf-8-sig' reads plain UTF-8 as well, and drops a leading BOM that would
    # otherwise sit in front of the first Markdown construct (e.g. "# Title")
    # and stop it from being recognised.
    with open(md_path, 'r', encoding='utf-8-sig') as f:
        md_text = f.read()

    extensions = [
        TableExtension(),
        FencedCodeExtension(),
        CodeHiliteExtension(css_class='codehilite', guess_lang=False),
        TocExtension(permalink=False),
    ]
    html_body = markdown.markdown(md_text, extensions=extensions)

    # The title comes from the file name, which may contain '&', '<' or '>';
    # escape it so it cannot break the markup. Quotes are harmless in element
    # content, so they are left untouched.
    title = escape(os.path.splitext(os.path.basename(md_path))[0], quote=False)
    return f"""<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>{title}</title>
<style>{CSS}</style>
</head>
<body>
{html_body}
</body>
</html>"""


def html_to_pdf_chrome(html_content, pdf_path):
    """Render an HTML string to a PDF file with headless Google Chrome.

    The HTML is written into a scratch directory and Chrome prints it to a
    scratch PDF in that same directory. The PDF is moved to ``pdf_path`` only
    when this invocation actually produced a non-empty file, so a PDF left
    over from an earlier run can never be mistaken for a successful
    conversion, and an existing PDF is not touched when Chrome fails.

    Args:
        html_content: Complete HTML document text.
        pdf_path: Destination path of the PDF; an existing file is replaced
            on success.

    Returns:
        True if a fresh, non-empty PDF was written to ``pdf_path``,
        False otherwise (a diagnostic line is printed in that case).
    """
    # Function-scope import so the block does not depend on a file-level change.
    import shutil

    # The scratch directory (HTML input and PDF output) is removed on exit,
    # whatever happens inside.
    with tempfile.TemporaryDirectory() as work_dir:
        html_path = os.path.join(work_dir, 'page.html')
        scratch_pdf = os.path.join(work_dir, 'page.pdf')
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        cmd = [
            CHROME,
            '--headless',
            '--disable-gpu',
            '--no-sandbox',
            '--disable-software-rasterizer',
            '--run-all-compositor-stages-before-draw',
            f'--print-to-pdf={scratch_pdf}',
            '--no-pdf-header-footer',
            html_path,
        ]
        try:
            # errors='replace' keeps undecodable bytes in Chrome's output from
            # raising while the captured streams are decoded.
            proc = subprocess.run(
                cmd, capture_output=True, text=True, errors='replace', timeout=60
            )
            if not (os.path.exists(scratch_pdf) and os.path.getsize(scratch_pdf) > 0):
                # Surface Chrome's exit code and last stderr line instead of
                # discarding them.
                stderr_lines = (proc.stderr or '').strip().splitlines()
                reason = stderr_lines[-1] if stderr_lines else '未生成 PDF'
                print(f"  Chrome 错误: 退出码 {proc.returncode}, {reason}")
                return False
            shutil.move(scratch_pdf, pdf_path)
            return True
        except (OSError, subprocess.SubprocessError) as e:
            # Missing/unlaunchable Chrome, timeout, or a failed move.
            print(f"  Chrome 错误: {e}")
            return False


def main():
    """Convert every matching Markdown file under BASE_DIR to a PDF.

    Files are collected per folder pattern in DIRS and per file pattern in
    PATTERNS (sorted within each pair). Each PDF is written next to its
    Markdown source with the same base name. Progress and a final summary are
    printed; a failure on one file is reported and does not stop the batch.
    """
    # Gather all target files, keeping the DIRS -> PATTERNS -> sorted order.
    targets = [
        path
        for dir_glob in DIRS
        for name_glob in PATTERNS
        for path in sorted(glob.glob(os.path.join(BASE_DIR, dir_glob, name_glob)))
    ]

    if not targets:
        print("❌ 未找到目标 md 文件")
        return

    total = len(targets)
    print(f"找到 {total} 个 Markdown 文件待转换")
    print("使用 Chrome Headless 浏览器渲染\n")

    ok_count = 0
    fail_count = 0
    for index, source in enumerate(targets, start=1):
        folder, filename = os.path.split(source)
        pdf_name = os.path.splitext(filename)[0] + '.pdf'
        pdf_path = os.path.join(folder, pdf_name)

        print(f"[{index:2d}/{total}] {os.path.relpath(source, BASE_DIR)}")
        try:
            page = md_to_html(source)
            if not html_to_pdf_chrome(page, pdf_path):
                print("       ❌ PDF 生成失败")
                fail_count += 1
                continue
            size_kb = os.path.getsize(pdf_path) / 1024
            print(f"       ✅ → {pdf_name} ({size_kb:.0f} KB)")
            ok_count += 1
        except Exception as e:
            # Per-file boundary: report the error and carry on with the rest.
            print(f"       ❌ 错误: {e}")
            fail_count += 1

    print("\n" + "=" * 50)
    print(f"转换完成: {ok_count} 成功, {fail_count} 失败, 共 {total} 个文件")


# Run the batch conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()