将文档转换为 docx 后合并
import os
import sys
import time
import shutil
import subprocess
from pathlib import Path
# ========== Auto-install dependencies ==========
def install(package_name, import_name=None):
    """Make sure a package is importable, installing it with pip if it is not.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name used for the import probe. Falls back to
            ``package_name`` when omitted or empty.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    try:
        __import__(import_name or package_name)
    except ImportError:
        print(f"⚙️ 正在安装依赖: {package_name} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # The package was written to disk after the interpreter started;
        # clear the import finders' caches so the imports that follow this
        # call can see it.
        importlib.invalidate_caches()
# Required packages: the merge itself depends on both of these.
install(package_name="python-docx", import_name="docx")
install(package_name="docxcompose", import_name="docxcompose")

# Optional package: win32com is only needed for the Word-based conversion
# path, so a failed installation is deliberately ignored.
try:
    __import__("win32com.client")
except ImportError:
    try:
        install(package_name="pywin32", import_name="win32com")
    except Exception:
        pass
from docx import Document
from docxcompose.composer import Composer
# ========== Configuration ==========
input_dir = r"D:\doc" # Directory to merge (contains .doc / .docx files)
output_file = r"D:\merged.docx" # Output file (must be .docx)
# ========== Locate soffice.exe ==========
def find_soffice_exe():
    """Return the path of a LibreOffice ``soffice`` executable, or ``None``.

    Probe order: ``<drive>\\LibreOffice\\program\\soffice.exe`` on E:, D:
    and C: (in that order), then the two default "Program Files" install
    locations, and finally whatever ``shutil.which("soffice")`` finds on
    PATH.
    """
    # Portable / custom installs directly under a drive root.
    drive_installs = [
        Path(letter + r"\LibreOffice\program\soffice.exe")
        for letter in ("C:", "D:", "E:")
    ]
    existing_drive_installs = [str(exe) for exe in drive_installs if exe.exists()]

    # Hits on later drives take precedence over earlier ones, and every
    # drive hit outranks the default install locations.
    search_order = existing_drive_installs[::-1] + [
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
    ]

    for location in search_order:
        if os.path.exists(location):
            return location

    # Nothing at the known locations: fall back to a PATH lookup.
    return shutil.which("soffice")
def has_word():
    """Report whether ``win32com.client`` can be imported.

    Returns:
        ``True`` when the import succeeds, ``False`` when it raises.
    """
    try:
        __import__("win32com.client")
    except Exception:
        return False
    return True
# ========== Conversion functions ==========
def convert_doc_to_docx_with_soffice(soffice_path, doc_path):
    """Convert one document to .docx by running LibreOffice headless.

    The converted file is written into the same directory as the source.

    Args:
        soffice_path: Path of the ``soffice`` executable to run.
        doc_path: Path of the document to convert.

    Returns:
        The path of the converted ``.docx`` file, or ``None`` if LibreOffice
        could not be run, reported an error, or produced no output file.
    """
    # A bare file name has an empty dirname; use the current directory
    # instead of passing an empty --outdir argument.
    outdir = os.path.dirname(doc_path) or "."
    # The output keeps the source's base name and gets a ".docx" extension.
    # Appending "x" to the input path only matches that for a lower-case
    # ".doc" suffix.
    new_path = os.path.splitext(doc_path)[0] + ".docx"
    print(f"🔄 LibreOffice 转换: {doc_path}")
    try:
        subprocess.run(
            [soffice_path, "--headless", "--convert-to", "docx", "--outdir", outdir, doc_path],
            check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        )
    except subprocess.CalledProcessError as e:
        print("❌ LibreOffice 转换出错:")
        print(e.stdout or str(e))
        return None
    except OSError as e:
        # The executable could not be started at all (missing, not
        # executable, ...). Report it the same way as a failed run.
        print("❌ LibreOffice 转换出错:")
        print(e)
        return None

    # Give the output file up to ~3 seconds to show up on disk.
    for _ in range(30):
        if os.path.exists(new_path):
            break
        time.sleep(0.1)
    if os.path.exists(new_path):
        print(f"✅ 转换成功: {new_path}")
        return new_path
    print(f"❌ 转换失败: 未找到 {new_path}")
    return None
def convert_doc_to_docx_with_word(doc_path):
    """Convert one document to .docx by automating Microsoft Word over COM.

    Args:
        doc_path: Path of the document to convert.

    Returns:
        The path of the converted ``.docx`` file (same directory and base
        name as ``doc_path``), or ``None`` if pywin32/Word is unavailable or
        the conversion failed. Any exception is reported and swallowed.
    """
    print(f"🔄 Word 转换: {doc_path}")
    try:
        import win32com.client as win32

        # Same base name with a ".docx" extension, whatever the case of the
        # source extension.
        new_path = os.path.splitext(doc_path)[0] + ".docx"
        word = win32.gencache.EnsureDispatch("Word.Application")
        try:
            word.Visible = False
            # Word runs as a separate process, so hand it absolute paths
            # rather than ones relative to this script's working directory.
            doc = word.Documents.Open(os.path.abspath(doc_path))
            try:
                # 16 = wdFormatXMLDocument (.docx)
                doc.SaveAs(os.path.abspath(new_path), FileFormat=16)
            finally:
                doc.Close(False)
        finally:
            # Runs even when Close() fails, so no hidden Word instance is
            # left behind.
            word.Quit()
        if os.path.exists(new_path):
            print(f"✅ 转换成功: {new_path}")
            return new_path
        print(f"❌ 转换失败: 未找到 {new_path}")
    except Exception as e:
        print(f"❌ Word 转换出错: {e}")
    return None
def convert_doc_to_docx(doc_path):
    """Convert a document to .docx, trying LibreOffice first and Word second.

    Args:
        doc_path: Path of the document to convert.

    Returns:
        The path returned by the first converter that succeeds, or ``None``
        (after printing a warning) when neither produced a result.
    """
    converted = None

    soffice_exe = find_soffice_exe()
    if soffice_exe and os.path.exists(soffice_exe):
        converted = convert_doc_to_docx_with_soffice(soffice_exe, doc_path)

    # Word is only consulted when LibreOffice was missing or failed.
    if not converted and has_word():
        converted = convert_doc_to_docx_with_word(doc_path)

    if converted:
        return converted

    print(f"⚠️ 无法转换(缺少 LibreOffice 或 Word):{doc_path}")
    return None
# ========== Merge ==========
def merge_docs(input_dir, output_file):
    """Merge every .doc / .docx file of a directory into a single .docx.

    Files are processed in sorted name order. The first one becomes the
    master document; each later one is appended after a page break. Names
    starting with ``~$`` are ignored. ``.doc`` files are converted to
    ``.docx`` first; a ``.doc`` whose same-named ``.docx`` is already in the
    directory is skipped, so that file is neither overwritten nor merged
    twice.

    Args:
        input_dir: Directory containing the documents to merge.
        output_file: Destination path. Its extension is forced to ``.docx``.
            If it is one of the input files, the result is written to
            ``_merged_output.docx`` in the same directory instead.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if not output_file.lower().endswith(".docx"):
        base, _ = os.path.splitext(output_file)
        output_file = base + ".docx"
        print(f"ℹ️ 输出强制为 .docx:{output_file}")

    if not os.path.isdir(input_dir):
        print(f"⚠️ 输入目录不存在:{input_dir}")
        return

    names = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith((".doc", ".docx")) and not f.startswith("~$")
    )
    if not names:
        print("⚠️ 目录中没有 .doc / .docx 文件。")
        return

    # Build the list of .docx files to merge.
    present = {os.path.normcase(n) for n in names}
    docx_paths = []
    for name in names:
        p = os.path.join(input_dir, name)
        if name.lower().endswith(".docx"):
            docx_paths.append(p)
            continue
        # Converting would write "<stem>.docx" next to the source. If that
        # file is already listed, converting again would overwrite it and
        # put the same content into the merge twice.
        counterpart = os.path.splitext(name)[0] + ".docx"
        if os.path.normcase(counterpart) in present:
            print(f"ℹ️ 已存在同名 .docx,跳过转换:{name}")
            continue
        newp = convert_doc_to_docx(p)
        if newp:
            docx_paths.append(newp)

    docx_paths = [p for p in docx_paths if os.path.exists(p)]
    if not docx_paths:
        print("⚠️ 没有可合并的 .docx(请先安装 LibreOffice 或 Word)。")
        return

    print(f"🧩 母版:{os.path.basename(docx_paths[0])}")
    master = Document(docx_paths[0])
    composer = Composer(master)
    for i, p in enumerate(docx_paths[1:], start=2):
        print(f"📄 追加第 {i} 个:{os.path.basename(p)}")
        # The break goes at the end of what has been merged so far, so each
        # appended document starts on a new page and no empty page trails
        # the last one.
        master.add_page_break()
        composer.append(Document(p))

    # Never overwrite one of the inputs with the merged result.
    abs_inputs = {os.path.normcase(os.path.abspath(p)) for p in docx_paths}
    out = output_file
    if os.path.normcase(os.path.abspath(out)) in abs_inputs:
        out = os.path.join(os.path.dirname(out), "_merged_output.docx")
        print(f"ℹ️ 输出与输入冲突,改为:{out}")
    composer.save(out)
    print(f"\n✅ 合并完成:{out}")
# Script entry point: merge using the module-level configuration values.
if __name__ == "__main__":
    merge_docs(input_dir=input_dir, output_file=output_file)
❤️ 转载文章请注明出处,谢谢!❤️