Files
resume_data/batch_pdf_resume_to_txt.py
YangGuo 131e6295e7 上传文件至「/」
求职者人物画像
2026-04-02 11:40:54 +08:00

57 lines
1.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import PyPDF2
import os
from pathlib import Path
def specified_pdf_to_txt(pdf_file_path: str, save_dir: str) -> bool:
    """Convert a single PDF into a UTF-8 text file that keeps the PDF's base name.

    The output is ``<save_dir>/<pdf stem>.txt``. If that file already exists
    the conversion is skipped and reported as a success, so repeated runs do
    not redo work.

    :param pdf_file_path: Full path of the PDF to convert.
    :param save_dir: Directory that receives the TXT file; created if missing.
    :return: ``True`` when the TXT exists after the call (freshly written or
        already present), ``False`` when the input is not a PDF file or the
        conversion failed.
    """
    # 1. Validate the input. is_file() (rather than exists()) also rejects a
    #    directory that merely happens to be named "*.pdf", before any side
    #    effect such as creating the output directory takes place.
    pdf_path = Path(pdf_file_path)
    if not pdf_path.is_file() or pdf_path.suffix.lower() != ".pdf":
        print(f"错误文件不存在或不是PDF格式 → {pdf_file_path}")
        return False

    # 2. Make sure the output directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # 3. Target path: same file name as the PDF, only the suffix changes.
    txt_filename = pdf_path.stem + ".txt"
    txt_save_path = Path(save_dir) / txt_filename

    # 4. An existing TXT with the same name means the work is already done.
    if txt_save_path.exists():
        print(f"跳过:{txt_filename} 已存在,无需重复转换")
        return True

    # 5. Convert. The text is first written to a sibling ".part" file and only
    #    moved onto the final name once it is complete; otherwise a failed
    #    write would leave a truncated TXT that step 4 skips on every later run.
    partial_path = txt_save_path.with_name(txt_filename + ".part")
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Extract while the PDF stream is still open.
            page_texts = [page.extract_text() for page in reader.pages]

        # Pages that yield no text are dropped; each kept page is followed by
        # a blank line.
        full_text = "".join(text + "\n\n" for text in page_texts if text)

        with open(partial_path, "w", encoding="utf-8") as f:
            f.write(full_text)
        os.replace(partial_path, txt_save_path)
    except Exception as e:
        # Deliberately broad: any failure is reported through the return
        # value instead of propagating to the caller.
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it cannot be removed; ignore
        print(f"❌ 转换失败:{e}")
        return False
    else:
        # The original message printed the name and the path with nothing
        # between them; separate them with the arrow used in the error message.
        print(f"✅ 转换成功:{pdf_path.name} → {txt_save_path}")
        return True
if __name__ == "__main__":
    # Directory that will receive the generated TXT file.
    SAVE_DIRECTORY = "./output/前后端"
    # Full path of the one PDF this script run converts.
    TARGET_PDF = "./前后端/陈盼良简历(2).pdf"
    # Run the conversion; the boolean result is not inspected here.
    specified_pdf_to_txt(pdf_file_path=TARGET_PDF, save_dir=SAVE_DIRECTORY)