Files
resume_data/batch_resume_pdf_to_txt_clean_final.py
YangGuo 131e6295e7 上传文件至「/」
求职者人物画像
2026-04-02 11:40:54 +08:00

121 lines
4.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pdfplumber
import os
import re
from pathlib import Path
# ===================== Cleaning configuration =====================
# Matches a run of AT LEAST 30 ASCII letters / digits / underscores that is
# immediately followed by "~~". Runs shorter than 30 characters are left
# untouched, so this is not an "any length" match.
FAKE_CODE_PATTERN = re.compile(r'[A-Za-z0-9_]{30,}~~')
# ===================================================================


def clean_resume_text(raw_text: str) -> str:
    """Strip anti-forgery codes from extracted text and normalise whitespace.

    Steps, in order:
      1. Remove every substring matching ``FAKE_CODE_PATTERN``.
      2. Remove leading/trailing spaces on each line and collapse runs of
         two or more spaces into one.
      3. Collapse three or more consecutive newlines into exactly two, so
         paragraphs stay separated by a single blank line.
      4. Strip leading/trailing whitespace from the whole text.

    Args:
        raw_text: Text as extracted from the PDF. Empty input yields "".

    Returns:
        The cleaned text.
    """
    if not raw_text:
        return ""

    text = FAKE_CODE_PATTERN.sub('', raw_text)

    # Per-line space stripping must happen BEFORE blank-line collapsing:
    # a line holding only spaces becomes empty here, and only then can the
    # newline-collapse below see it as part of a blank-line run.
    text = re.sub(r'^[ ]+|[ ]+$', '', text, flags=re.MULTILINE)
    text = re.sub(r' {2,}', ' ', text)

    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def single_pdf_to_txt(pdf_file_path: Path, save_dir: Path) -> int:
    """Convert one PDF into a cleaned UTF-8 text file named after its stem.

    Args:
        pdf_file_path: Path of the PDF to convert.
        save_dir: Directory that receives ``<stem>.txt``; created if missing.

    Returns:
        1 if the file was converted and written,
        0 if ``<stem>.txt`` already exists in ``save_dir`` (nothing is done),
        -1 if the input is not a regular ``.pdf`` file or conversion failed.
    """
    # is_file() rather than exists(): a directory that happens to be called
    # "something.pdf" is not a valid input.
    if not pdf_file_path.is_file() or pdf_file_path.suffix.lower() != ".pdf":
        print(f"❌ 无效文件:{pdf_file_path.name}")
        return -1

    # The output keeps the PDF's stem; only the extension changes.
    txt_filename = pdf_file_path.stem + ".txt"
    txt_save_path = save_dir / txt_filename

    # Never redo work: an existing target means this PDF is treated as done.
    if txt_save_path.exists():
        print(f"⏭️ 已存在,跳过:{txt_filename}")
        return 0

    # Write to a temporary sibling first. Because an existing target is
    # skipped on later runs, a half-written target would otherwise be
    # mistaken for a finished conversion forever.
    tmp_save_path = save_dir / (txt_filename + ".tmp")
    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        # Pages with no extractable text (None / "") are dropped.
        full_text = "".join(text + "\n\n" for text in page_texts if text)

        cleaned_text = clean_resume_text(full_text)

        save_dir.mkdir(parents=True, exist_ok=True)
        with open(tmp_save_path, "w", encoding="utf-8") as f:
            f.write(cleaned_text)
        tmp_save_path.replace(txt_save_path)

        print(f"✅ 转换+清洗完成:{pdf_file_path.name}")
        return 1
    except Exception as e:
        # Deliberate per-file boundary: one bad PDF must not abort a batch.
        try:
            tmp_save_path.unlink()
        except OSError:
            pass  # temp file was never created or is already gone
        print(f"❌ 失败:{pdf_file_path.name} | {str(e)}")
        return -1
def batch_pdf_to_txt(pdf_input_dir: str, txt_save_dir: str, recursive: bool = True):
    """Convert every PDF under a directory to cleaned text and print a summary.

    Each PDF is handed to ``single_pdf_to_txt``; its return code (1 / 0 / other)
    is tallied as success / skipped / failed.

    Note: output files are named by stem only, so with ``recursive=True`` two
    PDFs sharing a stem in different sub-folders map to the same ``.txt``; the
    later one (in sorted path order) is reported as skipped.

    Args:
        pdf_input_dir: Directory to scan for ``*.pdf`` (case-insensitive).
        txt_save_dir: Directory for the generated ``.txt`` files; created if
            it does not exist.
        recursive: Scan sub-directories too when True.

    Returns:
        None. Progress and the final tally are printed to stdout.
    """
    input_path = Path(pdf_input_dir)
    save_path = Path(txt_save_dir)

    # is_dir() rather than exists(): a regular file is not a scannable input.
    if not input_path.is_dir():
        print(f"❌ 输入目录不存在:{input_path}")
        return
    save_path.mkdir(parents=True, exist_ok=True)

    # Collect candidates. Directories that merely match the pattern are
    # dropped, and the list is sorted so processing order (and therefore which
    # file wins a same-stem collision) is reproducible across runs.
    pattern = "*.[pP][dD][fF]"
    matches = input_path.rglob(pattern) if recursive else input_path.glob(pattern)
    pdf_files = sorted(p for p in matches if p.is_file())

    if not pdf_files:
        print("⚠️ 未找到任何PDF文件")
        return
    print(f"📄 共找到 {len(pdf_files)} 个PDF开始处理...\n")

    # Tally per-file outcomes.
    success = 0
    skip = 0
    fail = 0
    for pdf in pdf_files:
        res = single_pdf_to_txt(pdf, save_path)
        if res == 1:
            success += 1
        elif res == 0:
            skip += 1
        else:
            fail += 1

    # Final report.
    print("\n" + "="*50)
    print("🎉 批量处理完成")
    print(f"总文件:{len(pdf_files)}")
    print(f"✅ 成功:{success}")
    print(f"⏭️ 跳过:{skip}")
    print(f"❌ 失败:{fail}")
    print(f"📂 输出目录:{save_path.absolute()}")
    print("="*50)
if __name__ == "__main__":
    # ---------- Only these three settings need editing ----------
    # Folder containing the source PDFs.
    PDF_INPUT_DIR = "./简历"
    # Folder that receives the cleaned TXT files.
    # NOTE(review): the name reads "清晰" (clear); "清洗" (cleaned) may have
    # been intended. Left unchanged because it is a runtime path.
    TXT_SAVE_DIR = "./清晰后的简历"
    # Whether sub-folders are scanned as well.
    IS_RECURSIVE = True
    # -------------------------------------------------------------
    batch_pdf_to_txt(
        pdf_input_dir=PDF_INPUT_DIR,
        txt_save_dir=TXT_SAVE_DIR,
        recursive=IS_RECURSIVE,
    )