"""Batch-convert resume PDFs to plain-text files with watermark codes removed.

Each PDF is read with pdfplumber, its page text is concatenated, platform
anti-forgery codes (long runs of ``[A-Za-z0-9_]`` ending in ``~~``) are
removed, whitespace is normalised, and the result is written to a ``.txt``
file named after the PDF's stem.
"""
import os
import re
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    # Keep the pure text-cleaning helpers importable without the PDF backend;
    # the conversion functions report the missing dependency explicitly.
    pdfplumber = None

# ===================== Cleaning configuration =====================
# Anti-forgery / watermark code: 30 or more letters, digits or underscores
# immediately followed by "~~".
FAKE_CODE_PATTERN = re.compile(r'[A-Za-z0-9_]{30,}~~')
# ===================================================================

# Spaces/tabs at the start or end of any line.
_LINE_EDGE_BLANKS = re.compile(r'^[ \t]+|[ \t]+$', flags=re.MULTILINE)
# Two or more consecutive spaces inside a line.
_SPACE_RUN = re.compile(r' {2,}')
# Three or more consecutive newlines (i.e. two or more blank lines).
_BLANK_LINE_RUN = re.compile(r'\n{3,}')

# Shown when pdfplumber could not be imported.
_MISSING_BACKEND_MSG = "❌ 缺少依赖 pdfplumber,请先安装:pip install pdfplumber"


def clean_resume_text(raw_text: str) -> str:
    """Return *raw_text* with watermark codes removed and whitespace tidied.

    Steps, in order:

    1. Delete every match of ``FAKE_CODE_PATTERN``.
    2. Strip spaces/tabs from the start and end of every line.
    3. Collapse runs of two or more spaces into one space.
    4. Collapse three or more consecutive newlines into exactly two, so
       paragraphs stay separated by a single blank line.
    5. Strip leading/trailing whitespace from the whole text.

    Line edges are trimmed *before* blank lines are collapsed: removing a
    code can leave whitespace-only lines behind, and those would otherwise
    split a newline run and survive as extra blank lines.
    """
    text = FAKE_CODE_PATTERN.sub('', raw_text)
    text = _LINE_EDGE_BLANKS.sub('', text)
    text = _SPACE_RUN.sub(' ', text)
    text = _BLANK_LINE_RUN.sub('\n\n', text)
    return text.strip()


def single_pdf_to_txt(pdf_file_path: Path, save_dir: Path) -> int:
    """Convert one PDF to a cleaned TXT file inside *save_dir*.

    The output keeps the PDF's stem and gets a ``.txt`` suffix. An existing
    output file is never overwritten.

    Args:
        pdf_file_path: Path of the PDF to convert.
        save_dir: Directory that receives the TXT file (created if missing).

    Returns:
        ``1`` if the file was converted, ``0`` if the TXT already existed and
        the PDF was skipped, ``-1`` if the input is not a ``.pdf`` file, the
        PDF backend is unavailable, or conversion raised an error.
    """
    if not pdf_file_path.is_file() or pdf_file_path.suffix.lower() != ".pdf":
        print(f"❌ 无效文件:{pdf_file_path.name}")
        return -1

    # Same stem as the PDF; only the suffix changes.
    # NOTE(review): PDFs with the same stem in different sub-folders map to
    # the same TXT, so the later one is reported as "already exists".
    txt_filename = pdf_file_path.stem + ".txt"
    txt_save_path = save_dir / txt_filename

    # Do not convert the same file twice.
    if txt_save_path.exists():
        print(f"⏭️ 已存在,跳过:{txt_filename}")
        return 0

    if pdfplumber is None:
        print(_MISSING_BACKEND_MSG)
        return -1

    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        # Pages without extractable text are skipped; each remaining page is
        # followed by a blank line.
        full_text = "".join(f"{text}\n\n" for text in page_texts if text)
        cleaned_text = clean_resume_text(full_text)

        save_dir.mkdir(parents=True, exist_ok=True)
        txt_save_path.write_text(cleaned_text, encoding="utf-8")
    except Exception as e:
        # Per-file boundary: one broken PDF must not abort a whole batch.
        print(f"❌ 失败:{pdf_file_path.name} | {str(e)}")
        return -1
    else:
        print(f"✅ 转换+清洗完成:{pdf_file_path.name}")
        return 1


def batch_pdf_to_txt(pdf_input_dir: str, txt_save_dir: str, recursive: bool = True):
    """Convert every PDF under *pdf_input_dir* and print a summary.

    Args:
        pdf_input_dir: Directory to scan for ``.pdf`` files (any letter case).
        txt_save_dir: Directory that receives the TXT files; created if it
            does not exist.
        recursive: When true, sub-directories are scanned as well.

    Returns:
        None. Progress and the final counts are printed to stdout.
    """
    input_path = Path(pdf_input_dir)
    save_path = Path(txt_save_dir)

    if not input_path.exists():
        print(f"❌ 输入目录不存在:{input_path}")
        return

    os.makedirs(save_path, exist_ok=True)

    # Collect PDFs; sorting makes the processing order deterministic and the
    # is_file() filter drops directories whose name happens to end in .pdf.
    find = input_path.rglob if recursive else input_path.glob
    pdf_files = sorted(p for p in find("*.[pP][dD][fF]") if p.is_file())

    if not pdf_files:
        print("⚠️ 未找到任何PDF文件")
        return

    # Report a missing backend once instead of once per file.
    if pdfplumber is None:
        print(_MISSING_BACKEND_MSG)
        return

    print(f"📄 共找到 {len(pdf_files)} 个PDF,开始处理...\n")

    # Tally the per-file result codes (1 = converted, 0 = skipped, else failed).
    outcomes = [single_pdf_to_txt(pdf, save_path) for pdf in pdf_files]
    success = outcomes.count(1)
    skip = outcomes.count(0)
    fail = len(outcomes) - success - skip

    # Summary
    print("\n" + "="*50)
    print("🎉 批量处理完成")
    print(f"总文件:{len(pdf_files)}")
    print(f"✅ 成功:{success}")
    print(f"⏭️ 跳过:{skip}")
    print(f"❌ 失败:{fail}")
    print(f"📂 输出目录:{save_path.absolute()}")
    print("="*50)


if __name__ == "__main__":
    # ========== Only these three settings need editing ==========
    PDF_INPUT_DIR = "./简历"          # folder containing the PDFs
    TXT_SAVE_DIR = "./清晰后的简历"    # folder for the cleaned TXT files
    IS_RECURSIVE = True               # whether to scan sub-folders
    # =============================================================
    batch_pdf_to_txt(PDF_INPUT_DIR, TXT_SAVE_DIR, IS_RECURSIVE)