常见资料文件转换为 TXT 文件
代码概述
该代码旨在遍历指定目录中的所有文件,将支持的文件格式(如PDF、DOC、DOCX、PPT、PPTX、XLS、XLSX、TXT)转换为文本文件并保存。使用不同的库处理不同的文件格式,如 fitz 用于 PDF,docx 用于 DOCX,comtypes 用于 PPT,openpyxl 用于 XLSX,xlrd 用于 XLS,aspose-words 用于 DOC 等。
安装依赖库
在运行代码之前,需要安装以下Python库:
注意:Python版本为3.9
pip install pymupdf python-docx chardet python-pptx xlrd openpyxl comtypes aspose-words
代码解释
1 导入必要的库
import os
import fitz # PyMuPDF
import docx
import chardet
from pptx import Presentation
import xlrd
import openpyxl
import comtypes.client
import aspose.words as aw
2 定义提取文本的函数
为每种文件格式定义一个函数来提取文本:
从PPT文件提取文本
def extract_text_from_ppt(file_path):
    """Extract text from a legacy ``.ppt`` file through PowerPoint COM automation.

    The file is opened with the ``Powerpoint.Application`` COM object, so that
    COM server must be available on the machine running this function.

    Args:
        file_path: Path to the presentation. It is converted to an absolute
            path before being handed to PowerPoint.

    Returns:
        The text of every run found in the slides' text frames, joined with
        newlines. Shapes whose text cannot be read are reported on stdout
        and skipped.
    """
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    powerpoint.Visible = 1
    presentation = None
    text_runs = []
    try:
        presentation = powerpoint.Presentations.Open(os.path.abspath(file_path))
        for slide in presentation.Slides:
            for shape in slide.Shapes:
                if shape.HasTextFrame and shape.TextFrame.TextRange.Paragraphs().Count > 0:
                    try:
                        for paragraph in shape.TextFrame.TextRange.Paragraphs():
                            for run in paragraph.Runs():
                                text_runs.append(run.Text)
                    except Exception as e:
                        print(f"Error processing shape in slide: {e}")
    finally:
        # Always release the presentation and the PowerPoint instance, even
        # when opening or iterating fails; otherwise a PowerPoint process is
        # left running for every failed file.
        try:
            if presentation is not None:
                presentation.Close()
        finally:
            powerpoint.Quit()
    return '\n'.join(text_runs)
从PDF文件提取文本
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path to the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        A single string made of each page's ``get_text()`` output.
    """
    # The context manager closes the document handle even if a page fails.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
从DOCX文件提取文本
def extract_text_from_docx(file_path):
    """Return the paragraph text of a ``.docx`` file, one paragraph per line.

    Args:
        file_path: Path to the ``.docx`` file, opened with ``docx.Document``.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, joined
        with newlines.
    """
    # NOTE(review): only ``document.paragraphs`` is read here; confirm whether
    # text held elsewhere in the document (e.g. tables) is also wanted.
    document = docx.Document(file_path)
    return "\n".join(para.text for para in document.paragraphs)
从PPTX文件提取文本
def extract_text_from_pptx(file_path):
    """Return the text of every run in a ``.pptx`` presentation.

    Args:
        file_path: Path to the ``.pptx`` file, opened with ``Presentation``.

    Returns:
        The text of each run in each paragraph of each text-frame shape,
        in slide order, joined with newlines.
    """
    deck = Presentation(file_path)
    collected = []
    for slide in deck.slides:
        for shape in slide.shapes:
            # Shapes without a text frame carry no runs to collect.
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                collected.extend(run.text for run in paragraph.runs)
    return '\n'.join(collected)
从XLSX文件提取文本
def extract_text_from_xlsx(file_path):
    """Return the cell contents of every sheet in an ``.xlsx`` workbook.

    Args:
        file_path: Path to the workbook, loaded with ``openpyxl``.

    Returns:
        One line per row across all sheets, in ``sheetnames`` order. Cells in
        a row are tab-separated and ``None`` cells become empty strings.
    """
    book = openpyxl.load_workbook(file_path)
    lines = []
    for name in book.sheetnames:
        for row in book[name].iter_rows(values_only=True):
            cells = ("" if value is None else str(value) for value in row)
            lines.append("\t".join(cells))
    return "\n".join(lines)
从XLS文件提取文本
def extract_text_from_xls(file_path):
    """Return the cell contents of every sheet in a legacy ``.xls`` workbook.

    Args:
        file_path: Path to the workbook, opened with ``xlrd``.

    Returns:
        One line per row across all sheets. Each cell's ``value`` is passed
        through ``str`` and the cells of a row are tab-separated.
    """
    book = xlrd.open_workbook(file_path)
    lines = [
        "\t".join(str(cell.value) for cell in sheet.row(index))
        for sheet in book.sheets()
        for index in range(sheet.nrows)
    ]
    return "\n".join(lines)
从DOC文件提取文本
def extract_text_from_doc(file_path):
    """Return the text of a legacy ``.doc`` file, or ``""`` on failure.

    Args:
        file_path: Path to the document, loaded with ``aw.Document``
            (Aspose.Words).

    Returns:
        The result of the document's ``get_text()``. If loading or
        extraction raises, the error is printed and an empty string is
        returned instead.
    """
    try:
        return aw.Document(file_path).get_text()
    except Exception as e:
        # Best-effort: report the failure and fall through to the empty result.
        print(f"Error processing file {file_path}: {e}")
    return ""
从TXT文件提取文本
def extract_text_from_txt(file_path):
    """Read a text file using the encoding detected by ``chardet``.

    Args:
        file_path: Path to the text file.

    Returns:
        The decoded file contents. Bytes that cannot be decoded with the
        chosen encoding are dropped (``errors='ignore'``).
    """
    # Read the raw bytes inside a context manager so the handle is closed
    # promptly instead of being left to the garbage collector.
    with open(file_path, 'rb') as raw_file:
        rawdata = raw_file.read()
    # Fall back to UTF-8 when no encoding is reported, rather than silently
    # depending on the platform's locale encoding.
    encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
    with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
        return f.read()
3 文件扩展名与提取函数的映射
# Dispatch table: file extension (lower-case, with leading dot) -> function
# that takes a file path and returns the extracted text.
EXTRACTORS = {
    '.pdf': extract_text_from_pdf,
    '.docx': extract_text_from_docx,
    '.pptx': extract_text_from_pptx,
    '.ppt': extract_text_from_ppt,
    '.xlsx': extract_text_from_xlsx,
    '.xls': extract_text_from_xls,
    '.doc': extract_text_from_doc,
    '.txt': extract_text_from_txt,
}
4 生成输出文件路径的函数
def get_next_output_file_path(directory):
    """Return a not-yet-existing ``output_NNN.txt`` path inside *directory*.

    Numbering starts at the count of ``.txt`` files already in the directory
    (zero-padded to three digits). If that name is already taken, the number
    is incremented until a free name is found, so the caller can safely
    create the file in exclusive mode.

    Args:
        directory: Existing directory in which the output file will live.

    Returns:
        The path of the next free output file.

    Raises:
        OSError: If *directory* cannot be listed (e.g. it does not exist).
    """
    index = sum(1 for name in os.listdir(directory) if name.endswith('.txt'))
    while True:
        candidate = os.path.join(directory, f'output_{str(index).zfill(3)}.txt')
        # The count alone can collide with an existing file when unrelated
        # .txt files are present or earlier outputs were deleted.
        if not os.path.exists(candidate):
            return candidate
        index += 1
5 文件转换函数
def convert_to_text(file_path: str) -> str:
    """Extract text from *file_path* using the extractor for its extension.

    Args:
        file_path: Path to the file to convert.

    Returns:
        The text returned by the matching extractor in ``EXTRACTORS``.

    Raises:
        ValueError: If no extractor is registered for the file's extension.
    """
    _, ext = os.path.splitext(file_path)
    # EXTRACTORS keys are lower-case; normalise so ".PDF" or ".Docx" match too.
    extractor = EXTRACTORS.get(ext.lower())
    if extractor is None:
        raise ValueError(f"Unsupported file format: {ext}")
    return extractor(file_path)
6 遍历目录并转换文件的函数
def convert_files_in_directory(directory, target_directory=r'E:\MarkDeng\data2train'):
    """Convert every file under *directory* and append the text to one output file.

    The directory tree is walked recursively. Each file is passed to
    ``convert_to_text``; the result is written to a newly created output file
    followed by a newline. Files that fail to convert (including unsupported
    formats) are reported on stdout and skipped.

    Args:
        directory: Root directory whose files are converted.
        target_directory: Directory in which the combined output file is
            created. Defaults to the location that was previously hard-coded,
            so existing callers behave as before.

    Raises:
        FileExistsError: If the chosen output file already exists (it is
            opened in exclusive-creation mode).
    """
    file_paths = [os.path.join(root, filename)
                  for root, dirs, files in os.walk(directory)
                  for filename in files]
    # Make sure the destination exists before listing it for the next name.
    os.makedirs(target_directory, exist_ok=True)
    output_file_path = get_next_output_file_path(target_directory)
    total_files = len(file_paths)
    with open(output_file_path, 'x', encoding='utf-8') as f:
        for i, file_path in enumerate(file_paths, start=1):
            try:
                text = convert_to_text(file_path)
                f.write(text + '\n')
                print(f"Processed {i} out of {total_files} files.")
            except Exception as e:
                # Best-effort batch job: one bad file must not stop the run.
                print(f"Error processing file {file_path}: {e}")
7 主函数
if __name__ == '__main__':
    # Example usage: convert everything under the sample source directory.
    convert_files_in_directory(r'E:\MarkDeng\data2train\126套技巧教程')
总结
该代码通过定义不同的函数来提取各种文件格式的文本,并通过遍历目录中的文件来转换所有支持的文件格式,最终将转换结果保存到指定目录的文本文件中。这一过程简化了从多个文件中提取文本的复杂性,并可以很容易地扩展以支持更多的文件格式。
原文地址:https://blog.csdn.net/weixin_47420447/article/details/139856418
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!