安装
pip install sopdf
1. 打开文档并渲染页面
import sopdf

# Open a PDF, render its first page in memory, then render the same page
# to a PNG file at a higher DPI.
with sopdf.open("document.pdf") as doc:
    img_bytes = doc[0].render(dpi=150)  # returns PNG bytes
    doc[0].render_to_file("page-1.png", dpi=300)
2. 批量并行渲染
import sopdf

# Render every page of the document in one call with parallel=True and
# report how many results came back.
with sopdf.open("document.pdf") as doc:
    images = sopdf.render_pages(doc.pages, dpi=150, parallel=True)
    print(f"rendered: {len(images)} pages")
3. 提取文本与坐标块
import sopdf

# Extract the first page's text both as a single string and as blocks.
with sopdf.open("document.pdf") as doc:
    text = doc[0].get_text()
    blocks = doc[0].get_text_blocks()  # list[TextBlock], including bounding boxes
    print(text[:200])  # preview only the first 200 characters
    print(len(blocks))
4. 关键词搜索
import sopdf

# Search the first page for a keyword with match_case disabled.
with sopdf.open("document.pdf") as doc:
    hits = doc[0].search("invoice", match_case=False)  # list[Rect]
    print(f"hit count: {len(hits)}")
5. 拆分与合并 PDF
import sopdf

# Split selected pages into a new file, then split every page into the
# "pages" output directory.
with sopdf.open("source.pdf") as doc:
    chapter = doc.split(pages=[0, 1, 2], output="chapter-1.pdf")
    doc.split_each(output_dir="pages")

# Merging takes file paths and does not use the document opened above,
# so it runs outside the `with` block.
sopdf.merge(["intro.pdf", "body.pdf"], output="book.pdf")
6. 保存压缩与内存导出
import sopdf

# Save with the compress and garbage options enabled, then export the
# document to memory.
with sopdf.open("source.pdf") as doc:
    doc.save("output.pdf", compress=True, garbage=True)
    raw_bytes = doc.to_bytes()  # get bytes directly, without writing to disk
    print(len(raw_bytes))
7. 页面旋转
import sopdf

# Set the first page's rotation to 90 and save the result to a new file.
with sopdf.open("document.pdf") as doc:
    doc[0].rotation = 90
    doc.save("rotated.pdf")
8. 从 bytes / stream 打开
import sopdf

# Read the PDF into memory first ...
with open("document.pdf", "rb") as f:
    data = f.read()

# ... then open it from the in-memory bytes instead of a file path.
with sopdf.open(stream=data) as doc:
    print(doc.page_count)
9. 元数据与目录(TOC)
import sopdf

# Read and update document metadata, then walk the outline (TOC).
with sopdf.open("document.pdf") as doc:
    print(doc.metadata.title)
    doc.metadata.title = "Updated Title"
    print(doc.metadata.creation_datetime)

    for item in doc.outline.items:
        # NOTE(review): the +1 suggests item.page is zero-based — confirm.
        print(f"[p{item.page + 1}] {item.title}")

    flat = doc.outline.to_list()  # flat TOC structure compatible with PyMuPDF
    print(len(flat))
10. 加密与修复
import sopdf

# Open a password-protected file by supplying the password, then save a copy.
with sopdf.open("protected.pdf", password="hunter2") as doc:
    doc.save("unlocked.pdf")

# Open a damaged file and save it again under a new name.
# NOTE(review): presumably the library repairs the file on open/save — confirm.
with sopdf.open("corrupted.pdf") as doc:
    doc.save("repaired.pdf")
以上示例基于 SoPDF README 和 examples/ 目录整理。完整能力与最新 API 请以仓库文档为准:SoMarkAI/SoPDF
