pdf/html转docx

read_all_file

def read_file_all(file_path):
    """Return the full text of ``file_path`` decoded as UTF-8.

    Errors are reported on stdout instead of being raised: a missing file
    and any other failure while opening or reading each print a message,
    and the function then returns ``None``.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        print(f"文件 {file_path} 不存在")
    except Exception as err:
        print(f"读取文件时出错: {err}")
    # Reached only when one of the handlers above ran.
    return None

win32com

import win32com.client

def try_pdf_via_word(pdf_path, docx_path):
    """Convert a PDF to DOCX by letting Microsoft Word open and re-save it.

    Requires Windows with Microsoft Word installed and the ``win32com``
    package. Conversion quality depends on the PDF.

    Args:
        pdf_path: Path of the PDF to open.
        docx_path: Path the DOCX is written to.

    Conversion errors are printed rather than raised. A failure to start
    Word itself (the ``Dispatch`` call) still propagates to the caller.
    """
    import os  # local import keeps this snippet self-contained

    # Word resolves relative paths against its own working directory, not
    # the Python process's, so hand it absolute paths.
    pdf_path = os.path.abspath(pdf_path)
    docx_path = os.path.abspath(docx_path)

    # Start Word hidden so no window pops up.
    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False
    try:
        doc = word.Documents.Open(pdf_path)
        try:
            # 16 is wdFormatDocumentDefault, i.e. .docx.
            doc.SaveAs(docx_path, 16)
        finally:
            # Always release the document, even when saving failed.
            # 0 is wdDoNotSaveChanges, so Word never prompts.
            doc.Close(0)
        print("转换完成（效果因PDF而异）")
    except Exception as e:
        print(f"转换失败: {e}")
    finally:
        word.Quit()

# Usage example: this call runs as soon as the snippet is executed.
try_pdf_via_word(r'C:\path\to\your\input.pdf', r'C:\path\to\your\output.docx')

Aspose.words

import aspose.words as aw
# Load Input.pdf with Aspose.Words and write it back out as Output.docx.
# NOTE(review): presumably the output format is chosen from the ".docx"
# extension -- confirm against the Aspose.Words documentation.
doc = aw.Document("Input.pdf")
doc.save("Output.docx")

win32com-2

import win32com.client
import os

def pdf_to_word_using_word(pdf_path, docx_path=None):
    """Open a PDF in Microsoft Word and save it as a Word document.

    Note: this needs Microsoft Word 2013 or later.

    Args:
        pdf_path: Path of the PDF to convert.
        docx_path: Output path. When ``None``, the input path is reused
            with its extension replaced by ``.docx``.

    Returns:
        ``True`` on success, ``False`` if any step failed (the error is
        printed, not raised).
    """
    if docx_path is None:
        # splitext swaps only the final extension, whatever its case, so
        # "REPORT.PDF" or "a.pdf.d/x.pdf" cannot yield a wrong output path
        # or one equal to the input.
        docx_path = os.path.splitext(pdf_path)[0] + '.docx'

    # Word resolves relative paths against its own working directory, so
    # pass absolute paths to the COM calls.
    source = os.path.abspath(pdf_path)
    target = os.path.abspath(docx_path)

    try:
        word = win32com.client.Dispatch("Word.Application")
        try:
            # Run in the background without showing the Word window.
            word.Visible = False
            doc = word.Documents.Open(source)
            try:
                # 16 is wdFormatDocumentDefault.
                doc.SaveAs2(target, FileFormat=16)
            finally:
                # 0 is wdDoNotSaveChanges: close without prompting.
                doc.Close(0)
        finally:
            # Quit on failure too, otherwise a hidden WINWORD process is
            # left running.
            word.Quit()

        print(f"转换成功: {pdf_path} -> {docx_path}")
        return True

    except Exception as e:
        print(f"转换失败: {str(e)}")
        return False

# Usage example: no output path is given, so it is derived from the input path.
pdf_to_word_using_word("C:/path/to/your/document.pdf")

Aspose.pdf

import aspose.pdf as apdf
# Load the source PDF into an Aspose.PDF document object.
document = apdf.Document("Input.pdf")
# Build the save options and select DOC_X as the output format.
save_options = apdf.DocSaveOptions()
save_options.format = apdf.DocSaveOptions.DocFormat.DOC_X
# Write the document using those options.
document.save("Output.docx", save_options)

pdf2docx

from pdf2docx import Converter

pdf_file = 'input.pdf'
docx_file = 'output.docx'

# Create the converter for the source PDF.
cv = Converter(pdf_file)
try:
    # start=0 is the first page; end=None converts through the last page.
    cv.convert(docx_file, start=0, end=None)
finally:
    # Release the converter even when the conversion raises.
    cv.close()

docx4j

<dependencies>
    <dependency>
        <groupId>org.docx4j</groupId>
        <artifactId>docx4j-JAXB-Internal</artifactId> <!-- or another docx4j artifactId that suits your setup -->
        <version>8.3.2</version> <!-- prefer the latest stable release -->
    </dependency>
    <!-- Extra dependency that may be needed when using the ImportXHTML approach. -->
    <!-- NOTE(review): these coordinates could not be confirmed; the docx4j XHTML importer is normally
         published as org.docx4j:docx4j-ImportXHTML. Verify on Maven Central before relying on this entry. -->
    <dependency>
        <groupId>org.docx4j.convert.in.html</groupId>
        <artifactId>html-to-docx</artifactId>
        <version>8.3.1</version>
    </dependency>
</dependencies>

docx4j-importXHTML

import org.docx4j.Docx4J;
import org.docx4j.convert.in.xhtml.ImportFromHtml;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.io.SaveToZipFile;
import java.io.File;
import java.io.IOException;

/**
 * Converts an HTML file on disk into a DOCX file with docx4j.
 *
 * NOTE(review): this class depends on {@code org.docx4j.convert.in.xhtml.ImportFromHtml},
 * which could not be confirmed to exist; the importer documented for docx4j-ImportXHTML is
 * {@code XHTMLImporterImpl}. Verify the class name against the library version in use.
 */
public class HtmlToDocxConverter {

    /**
     * Imports the HTML file at {@code inputHtmlPath} into a new package and saves it
     * to {@code outputDocxPath}.
     *
     * Failures are not propagated: {@code IOException} and {@code Docx4JException} are
     * caught and only their stack trace is printed.
     */
    public static void convertHtmlToDocx(String inputHtmlPath, String outputDocxPath) {
        try {
            // Create an empty Word package to receive the imported content.
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
            
            // Run the conversion through ImportFromHtml (see the review note on the class).
            ImportFromHtml importHtml = new ImportFromHtml(wordMLPackage);
            // Per the original author's note, the second argument can point to an external CSS
            // file as a File object -- TODO confirm against the actual API; null is passed here.
            importHtml.convert(new File(inputHtmlPath), null); 

            // Save the package as a DOCX file.
            SaveToZipFile saver = new SaveToZipFile(wordMLPackage);
            saver.save(outputDocxPath);
            
            System.out.println("转换成功！输出文件: " + outputDocxPath);
            
        } catch (IOException | Docx4JException e) {
            // NOTE(review): if nothing in the try body declares IOException, javac rejects
            // this multi-catch -- check once the importer API is confirmed.
            e.printStackTrace();
        }
    }

    /** Demo entry point: converts one hard-coded HTML path to one hard-coded DOCX path. */
    public static void main(String[] args) {
        String htmlFilePath = "path/to/your/input.html"; // replace with your HTML file path
        String docxFilePath = "path/to/your/output.docx"; // replace with the desired DOCX output path
        convertHtmlToDocx(htmlFilePath, docxFilePath);
    }
}

docx4j-addAltChunk

import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.docx4j.openpackaging.parts.WordprocessingML.AltChunkType;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Writes a DOCX file that embeds an HTML string as an altChunk part via docx4j.
 */
public class HtmlToDocxAltChunk {

    /**
     * Normalizes {@code htmlContent} with Jsoup, embeds it as an HTML altChunk in a new
     * Word package, and saves the package to {@code outputDocxPath}.
     *
     * Failures are not propagated: a {@code Docx4JException} is caught and only its
     * stack trace is printed.
     *
     * NOTE(review): an altChunk stores the HTML as-is inside the package; presumably the
     * conversion to Word content happens when the file is opened in Word -- confirm.
     *
     * @param htmlContent    HTML markup to embed
     * @param outputDocxPath path of the DOCX file to write
     */
    public static void convertHtmlToDocxAltChunk(String htmlContent, String outputDocxPath) {
        try {
            // Let Jsoup normalize the markup into a complete HTML document.
            Document document = Jsoup.parse(htmlContent);
            // Declare the charset so it matches the UTF-8 bytes written below.
            document.head().prepend("<meta charset=\"utf-8\"/>");
            String normalizedHtmlContent = document.html();

            // Create an empty Word package.
            WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.createPackage();
            // Attach the HTML to the main document part as an altChunk.
            wordMLPackage.getMainDocumentPart().addAltChunk(AltChunkType.Html, normalizedHtmlContent.getBytes(StandardCharsets.UTF_8));

            // Save the document.
            wordMLPackage.save(new File(outputDocxPath));

            System.out.println("使用 AltChunk 转换成功！输出文件: " + outputDocxPath);

        } catch (Docx4JException e) {
            // Only Docx4JException is caught: nothing in the try body can throw
            // IOException, and Java rejects a catch for a checked exception that is
            // never thrown.
            e.printStackTrace();
        }
    }

    /** Demo entry point: converts a small inline HTML document. */
    public static void main(String[] args) {
        String htmlContent = "<!DOCTYPE html><html><head><title>示例</title></head><body><h1>这是一个标题</h1><p>这是一个<strong>段落</strong>。</p></body></html>";
        String docxFilePath = "path/to/your/output_altchunk.docx";
        convertHtmlToDocxAltChunk(htmlContent, docxFilePath);
    }
}

jsoup+poi

<!-- Apache POI for DOCX -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.3</version> <!-- prefer the latest stable release -->
</dependency>
<!-- Jsoup for HTML parsing -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.17.1</version> <!-- prefer the latest stable release -->
</dependency>

import org.apache.poi.xwpf.usermodel.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Minimal HTML-to-DOCX converter: parses HTML with Jsoup and rebuilds the supported
 * tags as Apache POI paragraphs.
 */
public class HtmlToWord {
    /** Demo entry point: converts a small inline HTML string to {@code output.docx}. */
    public static void main(String[] args) throws Exception {
        String htmlString = "<html><body><h1>这是一个主标题</h1><p>这是一个段落。</p></body></html>";
        convertHtmlToWord(htmlString, "output.docx");
    }

    /**
     * Writes the direct children of the HTML body to a new DOCX file.
     *
     * Only {@code h1} (bold, 20pt) and {@code p} elements are handled; every other tag
     * is skipped, and only each element's text is copied.
     *
     * @param htmlString     HTML markup to convert
     * @param outputFilePath path of the DOCX file to write
     * @throws IOException if the document cannot be written or closed
     */
    public static void convertHtmlToWord(String htmlString, String outputFilePath) throws IOException {
        // Parse the HTML string with Jsoup.
        Document htmlDocument = Jsoup.parse(htmlString);

        // XWPFDocument is Closeable; try-with-resources releases it even when
        // building or writing the document fails.
        try (XWPFDocument document = new XWPFDocument()) {
            // Walk only the direct children of <body>.
            Elements elements = htmlDocument.body().children();
            for (Element element : elements) {
                switch (element.tagName()) {
                    case "h1":
                        // Top-level heading: bold, 20pt.
                        XWPFParagraph titleParagraph = document.createParagraph();
                        XWPFRun titleRun = titleParagraph.createRun();
                        titleRun.setText(element.text());
                        titleRun.setBold(true);
                        titleRun.setFontSize(20);
                        break;
                    case "p":
                        // Plain paragraph.
                        XWPFParagraph paragraph = document.createParagraph();
                        XWPFRun run = paragraph.createRun();
                        run.setText(element.text());
                        break;
                    // Handlers for more tags (h2, li, ...) can be added here.
                }
            }

            // Save the Word document to the output file.
            try (FileOutputStream out = new FileOutputStream(outputFilePath)) {
                document.write(out);
            }
        }
    }
}

htmldocx

# Python使用htmldocx
from htmldocx import HtmlToDocx

def html_to_word(html_content, output_path):
    """Parse the HTML string ``html_content`` with HtmlToDocx and save the result to ``output_path``."""
    converter = HtmlToDocx()
    # parse_html_string hands back the document object that is saved below.
    document = converter.parse_html_string(html_content)
    document.save(output_path)

# Sample HTML (could be generated directly from a UI): a styled heading and a
# one-row, two-cell table. It is only defined here; nothing in this snippet
# passes it to html_to_word.
html_ui = """
<div style="font-family: Arial; color: #333;">
    <h1 style="color: #ff0000; text-align: center;">UI标题</h1>
    <table border="1" style="width: 100%;">
        <tr>
            <td>单元格1</td>
            <td>单元格2</td>
        </tr>
    </table>
</div>
"""

pdf/html转docx

read_all_file

win32com

Aspose.words

win32com-2

Aspose.pdf

pdf2docx

docx4j

docx4j-importXHTML

docx4j-addAltChunk

jsoup+poi

htmldocx

chadLi

引用和评论

linux安装comfyui

PyCharm 2026年4月新版本 2026.1 更新内容，安装激活使用教程

Python3 格式化时间（qbit）

让 Claude Code 拥有自我进化和记忆系统｜得物技术

【开源剪映小助手】开发者指南

微信电脑版4.x版本聊天记录监听脚本（带ui界面）

Python 项目管理由 poetry 切换到 uv（qbit）