上传文件至「/」
求职者人物画像
This commit is contained in:
121
batch_resume_pdf_to_txt_clean_final.py
Normal file
121
batch_resume_pdf_to_txt_clean_final.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import pdfplumber
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# ============ Cleaning configuration: anti-forgery code removal ============
# Matches a run of 30 or more ASCII letters, digits or underscores that is
# immediately followed by "~~". Runs shorter than 30 characters are left
# untouched.
FAKE_CODE_PATTERN = re.compile(r'[A-Za-z0-9_]{30,}~~')
# ===========================================================================


def clean_resume_text(raw_text: str) -> str:
    """Remove anti-forgery codes from resume text and normalise its whitespace.

    Steps, in order:
      1. Delete every match of ``FAKE_CODE_PATTERN``.
      2. Strip leading and trailing spaces from every line and collapse runs
         of two or more spaces into a single space.
      3. Collapse three or more consecutive newlines into exactly two, so a
         single blank line is kept between paragraphs.
      4. Strip leading and trailing whitespace from the whole text.

    Args:
        raw_text: Text as extracted from the PDF.

    Returns:
        The cleaned text.
    """
    # Step 1: remove every anti-forgery code.
    cleaned_text = FAKE_CODE_PATTERN.sub('', raw_text)

    # Step 2: trim spaces per line BEFORE collapsing blank lines. Lines that
    # hold only spaces (including lines left behind by a removed code) must
    # become truly empty first; otherwise "\n \n \n" is never seen as a run
    # of newlines and the extra blank lines survive.
    cleaned_text = re.sub(r'^[ ]+|[ ]+$', '', cleaned_text, flags=re.MULTILINE)
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)

    # Step 3: keep at most one blank line between paragraphs.
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    return cleaned_text.strip()
|
||||
|
||||
def single_pdf_to_txt(pdf_file_path: Path, save_dir: Path) -> int:
    """Convert one PDF into a cleaned UTF-8 text file placed in ``save_dir``.

    The output keeps the PDF's stem and gets a ``.txt`` suffix.

    Args:
        pdf_file_path: Path of the PDF to convert.
        save_dir: Directory the text file is written to.

    Returns:
        1 when the file was converted and written, 0 when the target text
        file already existed and the PDF was skipped, -1 when the input is
        missing / not a ``.pdf`` or any exception occurred while converting.
    """
    # Guard: the path must exist and carry a .pdf suffix (case-insensitive).
    if not (pdf_file_path.exists() and pdf_file_path.suffix.lower() == ".pdf"):
        print(f"❌ 无效文件:{pdf_file_path.name}")
        return -1

    # Same stem as the PDF, only the suffix changes.
    # NOTE(review): PDFs with the same stem in different sub-folders map to
    # the same output name, so the later one is reported as "skipped" --
    # confirm this is acceptable for recursive scans.
    txt_filename = pdf_file_path.stem + ".txt"
    target = save_dir / txt_filename

    # Never convert twice: an existing output file means "skip".
    if target.exists():
        print(f"⏭️ 已存在,跳过:{txt_filename}")
        return 0

    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        # Pages without extractable text are dropped; every kept page is
        # followed by a blank line.
        full_text = "".join(text + "\n\n" for text in page_texts if text)

        cleaned = clean_resume_text(full_text)

        with open(target, "w", encoding="utf-8") as out_file:
            out_file.write(cleaned)

        print(f"✅ 转换+清洗完成:{pdf_file_path.name}")
        return 1
    except Exception as e:
        # Any failure is reported and turned into the -1 status code so a
        # batch run can continue with the next file.
        print(f"❌ 失败:{pdf_file_path.name} | {str(e)}")
        return -1
|
||||
|
||||
def batch_pdf_to_txt(pdf_input_dir: str, txt_save_dir: str, recursive: bool = True):
    """Convert every PDF under ``pdf_input_dir`` to a cleaned TXT file.

    Args:
        pdf_input_dir: Directory scanned for ``.pdf`` files (any letter case).
        txt_save_dir: Directory that receives the text files; created when
            it does not exist yet.
        recursive: Scan sub-directories as well when true.

    Returns:
        None. Progress and a final summary are printed.
    """
    input_path, save_path = Path(pdf_input_dir), Path(txt_save_dir)

    if not input_path.exists():
        print(f"❌ 输入目录不存在:{input_path}")
        return

    save_path.mkdir(parents=True, exist_ok=True)

    # Pick the scanning method once; the pattern matches .pdf in any case.
    scan = input_path.rglob if recursive else input_path.glob
    pdf_files = list(scan("*.[pP][dD][fF]"))

    if not pdf_files:
        print("⚠️ 未找到任何PDF文件")
        return

    print(f"📄 共找到 {len(pdf_files)} 个PDF,开始处理...\n")

    # Status code 1 -> success, 0 -> skip, anything else -> fail.
    labels = {1: "success", 0: "skip"}
    tally = {"success": 0, "skip": 0, "fail": 0}
    for pdf_path in pdf_files:
        status = single_pdf_to_txt(pdf_path, save_path)
        tally[labels.get(status, "fail")] += 1

    success = tally["success"]
    skip = tally["skip"]
    fail = tally["fail"]

    # Summary
    separator = "=" * 50
    print("\n" + separator)
    print("🎉 批量处理完成")
    print(f"总文件:{len(pdf_files)}")
    print(f"✅ 成功:{success}")
    print(f"⏭️ 跳过:{skip}")
    print(f"❌ 失败:{fail}")
    print(f"📂 输出目录:{save_path.absolute()}")
    print(separator)
|
||||
|
||||
if __name__ == "__main__":
    # ---------- Only these three settings need to be edited ----------
    PDF_INPUT_DIR = "./简历"  # folder that holds the source PDFs
    TXT_SAVE_DIR = "./清晰后的简历"  # folder that receives the cleaned TXT files
    IS_RECURSIVE = True  # whether sub-folders are scanned as well
    # ------------------------------------------------------------------

    batch_pdf_to_txt(
        pdf_input_dir=PDF_INPUT_DIR,
        txt_save_dir=TXT_SAVE_DIR,
        recursive=IS_RECURSIVE,
    )
|
||||
Reference in New Issue
Block a user