自学内容网 自学内容网

常见资料文件转换为 TXT 文件

代码概述

        该代码旨在遍历指定目录中的所有文件,将支持的文件格式(如PDF、DOC、DOCX、PPT、PPTX、XLS、XLSX、TXT)中的文本提取出来,并合并保存到一个文本文件中。不同的文件格式使用不同的库处理:fitz(PyMuPDF)用于PDF,docx(python-docx)用于DOCX,python-pptx用于PPTX,comtypes用于PPT,openpyxl用于XLSX,xlrd用于XLS,aspose-words用于DOC,chardet用于检测TXT文件的编码。

安装依赖库

在运行代码之前,需要安装以下Python库:

注意:Python版本为3.9;处理PPT文件时通过comtypes调用PowerPoint应用程序,因此需要在安装了Microsoft PowerPoint的Windows系统上运行。

pip install pymupdf python-docx chardet python-pptx xlrd openpyxl comtypes aspose-words
代码解释

1 导入必要的库
import os
import fitz  # PyMuPDF
import docx
import chardet
from pptx import Presentation
import xlrd
import openpyxl
import comtypes.client
import aspose.words as aw
2 定义提取文本的函数

为每种文件格式定义一个函数来提取文本:

从PPT文件提取文本

def extract_text_from_ppt(file_path):
    """Extract the text of a legacy .ppt file by automating PowerPoint via COM.

    Args:
        file_path: Path to the .ppt file; it is converted to an absolute path
            before being handed to PowerPoint.

    Returns:
        The text of every run found in the slides' text frames, joined with
        newlines.

    Raises:
        Exception: Whatever the COM layer raises when PowerPoint cannot be
            started or the presentation cannot be opened. Errors on an
            individual shape are printed and that shape is skipped.
    """
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    text_runs = []
    try:
        powerpoint.Visible = 1
        abs_file_path = os.path.abspath(file_path)
        presentation = powerpoint.Presentations.Open(abs_file_path)
        try:
            for slide in presentation.Slides:
                for shape in slide.Shapes:
                    # The text-frame probes are inside the try as well, so a
                    # single misbehaving shape cannot abort the whole file.
                    try:
                        if not shape.HasTextFrame:
                            continue
                        text_range = shape.TextFrame.TextRange
                        if text_range.Paragraphs().Count <= 0:
                            continue
                        for paragraph in text_range.Paragraphs():
                            for run in paragraph.Runs():
                                text_runs.append(run.Text)
                    except Exception as e:
                        print(f"Error processing shape in slide: {e}")
        finally:
            # Release the document even when extraction fails.
            presentation.Close()
    finally:
        # Always shut the application down so no PowerPoint process is left
        # running after an error.
        powerpoint.Quit()
    return '\n'.join(text_runs)

从PDF文件提取文本

def extract_text_from_pdf(file_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)

从DOCX文件提取文本

def extract_text_from_docx(file_path):
    """Return the text of all paragraphs in a .docx file, one per line.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The paragraph texts joined with newlines.
    """
    document = docx.Document(file_path)
    lines = []
    for para in document.paragraphs:
        lines.append(para.text)
    return "\n".join(lines)

从PPTX文件提取文本

def extract_text_from_pptx(file_path):
    """Return the text of every run in a .pptx presentation.

    Args:
        file_path: Path to the .pptx file.

    Returns:
        The text of each run, in slide/shape/paragraph order, joined with
        newlines. Shapes without a text frame are skipped.
    """
    presentation = Presentation(file_path)
    pieces = [
        run.text
        for slide in presentation.slides
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
        for run in paragraph.runs
    ]
    return '\n'.join(pieces)

从XLSX文件提取文本

def extract_text_from_xlsx(file_path):
    """Return the cell contents of every sheet in an .xlsx workbook as text.

    Args:
        file_path: Path to the .xlsx file.

    Returns:
        One line per row, cells separated by tabs and rows by newlines.
        Empty cells (``None``) are written as empty strings.
    """
    workbook = openpyxl.load_workbook(file_path)
    lines = []
    for name in workbook.sheetnames:
        for values in workbook[name].iter_rows(values_only=True):
            cells = ["" if value is None else str(value) for value in values]
            lines.append("\t".join(cells))
    return "\n".join(lines)

从XLS文件提取文本

def extract_text_from_xls(file_path):
    """Return the cell contents of every sheet in a legacy .xls workbook.

    Args:
        file_path: Path to the .xls file.

    Returns:
        One line per row, with each cell's value converted via ``str`` and
        separated by tabs; rows are separated by newlines.
    """
    book = xlrd.open_workbook(file_path)
    lines = []
    for sheet in book.sheets():
        for index in range(sheet.nrows):
            cells = sheet.row(index)
            lines.append("\t".join(str(cell.value) for cell in cells))
    return "\n".join(lines)

从DOC文件提取文本

def extract_text_from_doc(file_path):
    """Return the text of a legacy .doc file using Aspose.Words.

    Args:
        file_path: Path to the .doc file.

    Returns:
        The document text, or an empty string if loading or extraction fails
        (the error is printed rather than raised).
    """
    try:
        return aw.Document(file_path).get_text()
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return ""

从TXT文件提取文本

def extract_text_from_txt(file_path):
    """Read a text file, using chardet to guess its encoding.

    Args:
        file_path: Path to the text file.

    Returns:
        The decoded file contents. Bytes that cannot be decoded with the
        detected encoding are dropped.
    """
    # Read the raw bytes inside a context manager so the handle is closed.
    with open(file_path, 'rb') as raw_file:
        rawdata = raw_file.read()
    # chardet reports None when it cannot decide (e.g. an empty file); fall
    # back to UTF-8 instead of silently using the platform's locale encoding.
    encoding = chardet.detect(rawdata)['encoding'] or 'utf-8'
    with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
        return f.read()

3 文件扩展名与提取函数的映射
# Maps a file extension (including the leading dot) to the function that
# extracts text from that format. Each function takes a file path and
# returns the extracted text as a string.
EXTRACTORS = {
    '.pdf': extract_text_from_pdf,
    '.docx': extract_text_from_docx,
    '.pptx': extract_text_from_pptx,
    '.ppt': extract_text_from_ppt,
    '.xlsx': extract_text_from_xlsx,
    '.xls': extract_text_from_xls,
    '.doc': extract_text_from_doc,
    '.txt': extract_text_from_txt,
}
4 生成输出文件路径的函数
def get_next_output_file_path(directory):
    """Return a path of the form ``output_NNN.txt`` that does not exist yet.

    Numbering starts at the count of ``.txt`` files currently in *directory*.
    That count alone can name a file that already exists (for example after
    earlier outputs were deleted, or when unrelated ``.txt`` files are
    present), so the number is advanced until the path is free.

    Args:
        directory: Existing directory in which the output file will live.

    Returns:
        The path of the first unused ``output_NNN.txt`` in *directory*, where
        NNN is zero-padded to at least three digits.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    index = sum(1 for name in os.listdir(directory) if name.endswith('.txt'))
    while True:
        candidate = os.path.join(directory, f'output_{str(index).zfill(3)}.txt')
        if not os.path.exists(candidate):
            return candidate
        index += 1
5 文件转换函数
def convert_to_text(file_path: str) -> str:
    """Extract the text of *file_path* with the extractor for its extension.

    Args:
        file_path: Path to the file to convert.

    Returns:
        The text produced by the matching function in ``EXTRACTORS``.

    Raises:
        ValueError: If the file's extension has no registered extractor.
    """
    _, ext = os.path.splitext(file_path)
    # Extensions are matched case-insensitively so that names such as
    # "REPORT.PDF" or "Slides.PPTX" are not rejected as unsupported.
    extractor = EXTRACTORS.get(ext.lower())
    if extractor is None:
        raise ValueError(f"Unsupported file format: {ext}")
    return extractor(file_path)
6 遍历目录并转换文件的函数
def convert_files_in_directory(directory, target_directory=r'E:\MarkDeng\data2train'):
    """Convert every file under *directory* to text and merge into one file.

    The directory tree is walked recursively. Each file is passed to
    ``convert_to_text`` and the resulting text, followed by a newline, is
    appended to a new output file whose path comes from
    ``get_next_output_file_path``. Files that fail to convert (including
    unsupported formats) are reported on stdout and skipped.

    Args:
        directory: Root directory whose files are converted.
        target_directory: Directory that receives the merged output file.
            Defaults to the location that was previously hard-coded; it is
            created if it does not exist.

    Raises:
        FileExistsError: If the chosen output path appears between being
            selected and being opened (the file is opened in exclusive mode).
    """
    file_paths = [os.path.join(root, filename)
                  for root, dirs, files in os.walk(directory)
                  for filename in files]
    # Make sure the output location exists before it is listed and written.
    os.makedirs(target_directory, exist_ok=True)
    output_file_path = get_next_output_file_path(target_directory)
    total_files = len(file_paths)
    # Mode 'x' refuses to overwrite an existing output file.
    with open(output_file_path, 'x', encoding='utf-8') as f:
        for i, file_path in enumerate(file_paths, start=1):
            try:
                text = convert_to_text(file_path)
                f.write(text + '\n')
                print(f"Processed {i} out of {total_files} files.")
            except Exception as e:
                # Best effort: one bad file must not stop the whole batch.
                print(f"Error processing file {file_path}: {e}")
7 主函数

if __name__ == '__main__':
    # Example usage: convert every file found under this directory tree.
    your_directory = r'E:\MarkDeng\data2train\126套技巧教程'
    convert_files_in_directory(your_directory)

总结

        该代码通过定义不同的函数来提取各种文件格式的文本,并通过遍历目录中的文件来转换所有支持的文件格式,最终将转换结果保存到指定目录的文本文件中。这一过程简化了从多个文件中提取文本的复杂性,并可以很容易地扩展以支持更多的文件格式。


原文地址:https://blog.csdn.net/weixin_47420447/article/details/139856418

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!