57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
import PyPDF2
|
||
import os
|
||
from pathlib import Path
|
||
|
||
def specified_pdf_to_txt(pdf_file_path: str, save_dir: str) -> bool:
    """Convert one PDF file into a UTF-8 text file stored in ``save_dir``.

    Rules: the TXT keeps the PDF's file name (only the suffix changes), and
    if a TXT with that name already exists in ``save_dir`` the conversion is
    skipped instead of being repeated.

    :param pdf_file_path: Full path of the PDF to convert.
    :param save_dir: Directory that receives the TXT file; created if missing.
    :return: ``True`` when the TXT was written or already existed, ``False``
        when the input is not an existing ``.pdf`` file or the conversion
        failed.
    """
    # 1. Validate the input. ``is_file`` (rather than ``exists``) also rejects
    #    a directory whose name merely ends in ".pdf", before anything is
    #    created on disk.
    pdf_path = Path(pdf_file_path)
    if not pdf_path.is_file() or pdf_path.suffix.lower() != ".pdf":
        print(f"错误:文件不存在或不是PDF格式 → {pdf_file_path}")
        return False

    # 2. Make sure the output directory exists.
    os.makedirs(save_dir, exist_ok=True)

    # 3. Target path: same base name as the PDF, ".txt" suffix.
    txt_filename = pdf_path.stem + ".txt"
    txt_save_path = Path(save_dir) / txt_filename

    # 4. An existing TXT of the same name means the work is already done.
    if txt_save_path.exists():
        print(f"跳过:{txt_filename} 已存在,无需重复转换")
        return True

    # 5. Extract the text and write it out. The text goes to a temporary
    #    sibling file first and is renamed into place only once fully written;
    #    otherwise an interrupted write would leave a partial TXT that step 4
    #    would treat as a finished conversion on every later run.
    tmp_save_path = txt_save_path.with_name(txt_filename + ".part")
    try:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Extraction must happen while the PDF stream is still open.
            page_texts = [page.extract_text() for page in reader.pages]

        # Pages that yield no text are skipped; each kept page is followed by
        # a blank line.
        full_text = "".join(text + "\n\n" for text in page_texts if text)

        with open(tmp_save_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(full_text)
        os.replace(tmp_save_path, txt_save_path)

        print(f"✅ 转换成功:{pdf_path.name} → {txt_save_path}")
        return True

    except Exception as e:
        # Reporting boundary: any parsing or I/O failure becomes a message
        # plus a ``False`` return instead of propagating to the caller.
        try:
            tmp_save_path.unlink()
        except OSError:
            # Best-effort cleanup; the temporary file may never have existed.
            pass
        print(f"❌ 转换失败:{str(e)}")
        return False
|
||
|
||
if __name__ == "__main__":
    # Full path of the PDF file to convert.
    TARGET_PDF = "./前后端/陈盼良简历(2).pdf"
    # Directory in which the generated TXT file is saved.
    SAVE_DIRECTORY = "./output/前后端"

    # Run the conversion. The function reports failure by returning False;
    # turn that into a non-zero exit status so a shell or scheduler can tell
    # the run did not succeed.
    if not specified_pdf_to_txt(TARGET_PDF, SAVE_DIRECTORY):
        raise SystemExit(1)