Python 将 PDF 转换为 Markdown(md)文档的多种方案
推荐
常规 PDF:推荐方案2,其次是方案1
PDF 中图片较多或为扫描件:推荐基于 OCR 的方案3
具备大模型基础和相应资源:推荐方案4
方案1 pdf转word转html转md
解析较慢,转换链路较长(PDF → Word → HTML → Markdown);效果与方案2类似,但速度不如方案2。
import time
import mammoth
import markdownify
from pdf2docx import Converter
# 参考 https://zhuanlan.zhihu.com/p/688159597 https://cloud.tencent.com/developer/article/1768983
# 转存Word文档内的图片
def convert_images(image):
    """Save one image embedded in the Word document to its own file.

    This function is passed to ``mammoth.images.img_element`` and is invoked
    once per image found during the Word -> HTML conversion.

    Args:
        image: Image object supplied by mammoth. Only ``content_type``
            (e.g. ``"image/png"``) and the ``open()`` context manager are used.

    Returns:
        dict: ``{"src": path}`` where ``path`` is the file the image bytes
        were written to; it becomes the ``src`` of the generated ``<img>``.
    """
    import uuid

    # ``base_path`` is a module-level name that is only assigned by the script
    # entry point. Fall back to the current directory when the module is
    # imported and the name was never set, instead of failing with NameError.
    try:
        out_dir = base_path
    except NameError:
        out_dir = ""

    file_suffix = image.content_type.split("/")[1]
    # A timestamp alone can repeat when several images are written within one
    # clock tick, which would silently overwrite an earlier image. The random
    # component makes every file name unique.
    file_name = "{}_{}.{}".format(time.time(), uuid.uuid4().hex, file_suffix)
    path_file = out_dir + file_name

    with image.open() as image_bytes, open(path_file, 'wb') as f:
        f.write(image_bytes.read())
    return {"src": path_file}
def pdf2md(pdf_path):
    """Convert a PDF to Markdown through the chain PDF -> Word -> HTML -> Markdown.

    Three files are written next to the input: ``<pdf_path>.docx``,
    ``<pdf_path>.docx.html`` and ``<pdf_path>.docx.html.md``. Images found in
    the Word document are saved by ``convert_images``. The elapsed time is
    printed when the conversion finishes.

    Args:
        pdf_path: Path of the PDF file to convert (a string; the output paths
            are built by appending suffixes to it).
    """
    start = time.time()
    word_path = pdf_path + '.docx'
    html_path = pdf_path + '.docx.html'
    md_path = pdf_path + '.docx.html.md'

    # PDF -> Word. Close the converter even if the conversion fails so the
    # underlying PDF handle is not leaked.
    cv = Converter(pdf_path)
    try:
        cv.convert(word_path, start=0, end=None)
    finally:
        cv.close()

    # Word -> HTML. mammoth's documented input is a binary file object.
    with open(word_path, 'rb') as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(convert_images),
        )
    html = result.value

    # HTML -> Markdown, using "#"-style (ATX) headings.
    md = markdownify.markdownify(html, heading_style="ATX")

    with open(html_path, 'w', encoding='utf-8') as html_file, \
            open(md_path, 'w', encoding='utf-8') as md_file:
        html_file.write(html)
        md_file.write(md)

    elapse_time = time.time() - start
    print('转换 '+pdf_path+' 运行时间:'+ str(elapse_time))
if __name__ == "__main__":
    # Demo run of the PDF -> Word -> HTML -> Markdown chain.
    # base_path is also the directory that convert_images writes images into.
    base_path = "/Users/xxxx/"
    source_pdf = base_path + "xxxxx.pdf"
    pdf2md(source_pdf)
方案2 使用 pymupdf 的 pymupdf4llm
解析速度快,但可能出现字体缺失、乱码等问题。
import time
import pymupdf4llm
# 参考 https://pymupdf.readthedocs.io/en/latest/rag.html
# pdf
def pdf2md(pdf_path, md_path):
    """Convert a PDF to Markdown with pymupdf4llm and save the result.

    Images are written out by pymupdf4llm (``write_images=True``). The elapsed
    time is printed when the conversion finishes.

    Args:
        pdf_path: Path of the PDF file to convert.
        md_path: Path of the Markdown file to write.
    """
    start = time.time()
    md_text = pymupdf4llm.to_markdown(pdf_path, write_images=True)

    # Write explicitly as UTF-8: the platform default encoding may be unable
    # to represent the extracted text. ``with`` closes the file even when the
    # write fails.
    with open(md_path, "w", encoding="utf-8") as output:
        output.write(md_text)

    elapse_time = time.time() - start
    print('转换 '+pdf_path+' 运行时间:'+ str(elapse_time))
if __name__ == "__main__":
    # Demo run: convert one PDF and write the Markdown into the same directory.
    base_path = "/Users/xxxx/"
    pdf_file = base_path + "xxxxx.pdf"
    md_file = base_path + "xxxx.pdf.md"
    pdf2md(pdf_file, md_file)
方案3 使用 pix2text 进行 ocr识别
pix2text 的安装体积较大,识别效果不错,但解析时间较长。
import time
from pix2text import Pix2Text
# 主要使用ocr识别 参考 https://www.breezedeus.com/article/p2t-v1.1
# 类似项目 VikParuchuri/marker https://github.com/VikParuchuri/marker?tab=readme-ov-file
def pdf2md(base_path, pdf_path, md_name):
    """Recognize a PDF with Pix2Text and export the result as Markdown.

    Args:
        base_path: Output directory handed to ``to_markdown`` as ``out_dir``.
        pdf_path: Path of the PDF file to recognize.
        md_name: Markdown file name handed to ``to_markdown`` as ``markdown_fn``.
    """
    started_at = time.time()

    # Formula recognition is switched off for this pipeline.
    recognizer = Pix2Text.from_config(enable_formula=False)
    recognizer.recognize_pdf(pdf_path).to_markdown(
        out_dir=base_path,
        markdown_fn=md_name,
    )

    seconds = time.time() - started_at
    print("".join(('转换 ', pdf_path, ' 运行时间:', str(seconds))))
if __name__ == "__main__":
    # Demo run: OCR one PDF and write the Markdown into base_path.
    base_path = "/Users/xxx/"
    pdf2md(base_path, pdf_path=base_path + "xxx.pdf", md_name="xxx.pdf.md")
方案4 使用 ai模型 gpt 问答
效果不可控,解析时间长,并且需要对模型进行微调和适配。
参考 https://mp.weixin.qq.com/s/P5KBcqXBqHqvrqnlMIkQYA
原文地址:https://blog.csdn.net/a1041646584/article/details/140722820
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!