Files
resume_data/batcg_extract.py
YangGuo 131e6295e7 上传文件至「/」
求职者人物画像
2026-04-02 11:40:54 +08:00

305 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import time
import os
import logging
from pathlib import Path
from typing import Dict, Any, List
import requests
from tqdm import tqdm # 进度条库,提升批量处理体验
# ===================== 1. 全局配置(请根据实际情况修改) =====================
class Config:
    """Central settings for the batch extraction script.

    Every value is a plain class attribute that the functions in this module
    read directly (e.g. ``Config.API_URL``). Adjust them to your environment.
    """

    # --- LLM API settings (replace with your own key / model) ---
    # The key is read from the DASHSCOPE_API_KEY environment variable when it is
    # set and non-empty, so a real secret does not have to be written into this
    # file; otherwise the placeholder below is used and must be edited by hand.
    # When switching to a different provider, update call_llm_api accordingly.
    API_KEY = os.getenv("DASHSCOPE_API_KEY") or "你的通义千问API_KEY"
    API_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
    MODEL = "qwen-turbo"  # chosen by the author as a lightweight, low-cost model
    TEMPERATURE = 0.0  # 0 = deterministic output, intended to reduce hallucination
    MAX_TOKENS = 2000  # caps the response length to keep cost down

    # --- Directory settings ---
    INPUT_DIR = "./raw_candidate_files"  # directory holding the raw review files
    OUTPUT_DIR = "./extracted_json_results"  # directory the extracted JSON is written to
    ERROR_LOG_PATH = "./batch_extract_error.log"  # error log file

    # --- File filtering ---
    SUPPORTED_EXTENSIONS = [".txt", ".json"]  # only files with these suffixes are processed
    SKIP_EXISTED = True  # skip inputs whose JSON output already exists (avoids repeat API calls)

    # --- Retry settings ---
    MAX_RETRY = 3  # maximum number of API call attempts
    RETRY_INTERVAL = 1  # pause between attempts, in seconds
# ===================== 2. 日志初始化(记录错误) =====================
def init_logger():
    """Configure logging and return this module's logger.

    Two handlers are installed on the root logger via ``logging.basicConfig``:
    a UTF-8 file handler writing to ``Config.ERROR_LOG_PATH`` and a console
    stream handler.

    The file handler is limited to ERROR and above so the log file stays a
    record of failures only. The module logger itself is set to INFO so that
    the ``logger.info(...)`` progress messages emitted elsewhere in this file
    reach the console instead of being discarded; the root level stays at
    ERROR, so other libraries remain quiet.

    Returns:
        The ``logging.Logger`` named after this module.
    """
    file_handler = logging.FileHandler(Config.ERROR_LOG_PATH, encoding="utf-8")
    file_handler.setLevel(logging.ERROR)  # keep the error log error-only

    logging.basicConfig(
        level=logging.ERROR,
        format="%(asctime)s - %(filename)s - %(message)s",
        handlers=[
            file_handler,
            logging.StreamHandler(),  # also echo records to the console
        ],
    )

    module_logger = logging.getLogger(__name__)
    # Records are filtered by the level of the logger they are emitted on, so
    # raising only this logger to INFO lets its info messages propagate to the
    # root handlers without lowering the threshold for third-party loggers.
    module_logger.setLevel(logging.INFO)
    return module_logger
# Module-level logger used by the functions in this file; logging is
# configured once, when this statement runs at import time.
logger = init_logger()
# ===================== 3. 工具函数 =====================
def check_and_create_dirs() -> None:
    """Ensure the configured input and output directories exist.

    Creates ``Config.INPUT_DIR`` and ``Config.OUTPUT_DIR`` if they are missing.
    Missing parent directories are created as well, so nested paths in the
    configuration work; existing directories are left untouched.
    """
    for directory in (Config.INPUT_DIR, Config.OUTPUT_DIR):
        Path(directory).mkdir(parents=True, exist_ok=True)
    logger.info(f"目录检查完成:输入目录={Config.INPUT_DIR},输出目录={Config.OUTPUT_DIR}")
def read_file_content(file_path: str) -> str:
    """Read a text file and return its content with surrounding whitespace removed.

    The file is decoded as UTF-8 first (a leading byte-order mark, if any, is
    dropped) and as GBK if that fails.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file content, stripped of leading/trailing whitespace.

    Raises:
        Exception: If the file cannot be opened or read, or if it cannot be
            decoded with any of the supported encodings. The original error is
            attached as ``__cause__``.
    """
    last_error = None
    # "utf-8-sig" reads plain UTF-8 identically to "utf-8" but also removes a
    # BOM, which str.strip() would otherwise leave at the start of the text.
    for encoding in ("utf-8-sig", "gbk"):
        try:
            with open(file_path, "r", encoding=encoding) as f:
                return f.read().strip()
        except UnicodeDecodeError as e:
            # Wrong encoding guess: remember the error and try the next one.
            last_error = e
        except Exception as e:
            # Anything else (missing file, permissions, ...) will not be fixed
            # by another encoding, so fail immediately.
            raise Exception(f"读取文件失败:{e}") from e
    raise Exception(f"读取文件失败:{last_error}") from last_error
# Output schema shown to the model. The top-level keys, the eight dimension
# names and the per-dimension field names are the ones validate_json checks.
_PROMPT_TEMPLATE = """
{
"基础信息": {
"最终结果": "Invite/Reject",
"分数": "数字字符串",
"面试岗位": "字符串",
"技术画像": "核心技术特征总结100字内",
"邀请解读": "评审结论总结100字内"
},
"8维度信息": {
"潜力洞察": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"叙事逻辑": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"软件工艺": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串",
"面试题": "字符串(无则省略该字段)"
},
"编程能力": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"综合网络": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"电子电路": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"操作系统": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串"
},
"算法能力": {
"得分": "数字字符串",
"核心亮点": "字符串",
"短板": "字符串(无则填无)",
"核心依据": "字符串",
"面试题": "字符串(无则省略该字段)"
}
}
}
"""

# Hard rules appended to the prompt to constrain the model's output format.
_PROMPT_RULES = """
【强制规则】
1. 输出必须是可直接解析的JSON格式不允许任何多余文字如“以下是抽取结果
2. 字段层级、命名必须与模板100%一致,禁止新增/删减字段、修改字段名;
3. 得分字段必须是数字字符串(如"7.8"),禁止纯数字或其他格式;
4. 核心亮点/短板/核心依据需精简提炼,禁止大段复制原文;
5. 无对应内容的字段填“无”,面试题无则省略该字段。
"""


def build_prompt(raw_text: str) -> str:
    """Build the extraction prompt for one raw review text.

    The prompt consists of, in order: an instruction line, the JSON output
    template, the formatting rules, the raw text itself, and a closing
    output requirement. Each section is introduced by a bracketed heading.

    Args:
        raw_text: The source text the model should extract information from.

    Returns:
        The complete prompt string, beginning and ending with a newline.
    """
    sections = (
        "",  # leading newline
        "请你严格按照以下模板和规则从下方原始文本中抽取信息并生成JSON",
        "【模板】",
        _PROMPT_TEMPLATE,
        "【规则】",
        _PROMPT_RULES,
        "【原始文本】",
        raw_text,
        "【输出要求】",
        "仅输出JSON字符串无任何其他内容",
        "",  # trailing newline
    )
    return "\n".join(sections)
def call_llm_api(prompt: str) -> str:
    """Send the prompt to the chat-completions endpoint and return the reply text.

    The request is a single user message posted to ``Config.API_URL`` with the
    model, temperature and token limit taken from ``Config``. Up to
    ``Config.MAX_RETRY`` attempts are made; any failure (network error, HTTP
    error status, unexpected response shape) is logged and retried after
    ``Config.RETRY_INTERVAL`` seconds.

    Args:
        prompt: The full prompt text to send as the user message.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.

    Raises:
        Exception: If every attempt fails. The error from the last attempt is
            attached as ``__cause__``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {Config.API_KEY}"
    }
    data = {
        "model": Config.MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": Config.TEMPERATURE,
        "max_tokens": Config.MAX_TOKENS
    }
    last_error = None
    for attempt in range(1, Config.MAX_RETRY + 1):
        try:
            response = requests.post(Config.API_URL, headers=headers, json=data, timeout=30)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Broad on purpose: this is the retry boundary, and every failure
            # mode is handled the same way (log, wait, try again).
            last_error = e
            logger.error(f"API调用重试{attempt}/{Config.MAX_RETRY}失败:{e}")
            # No point in waiting after the final attempt.
            if attempt < Config.MAX_RETRY:
                time.sleep(Config.RETRY_INTERVAL)
    raise Exception(f"API调用{Config.MAX_RETRY}次均失败") from last_error
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the whole opening fence line, including a language tag such as "json".
    _, _, cleaned = cleaned.partition("\n")
    cleaned = cleaned.rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def validate_json(raw_json: str) -> Dict[str, Any]:
    """Parse the model output and verify that the required structure is present.

    The text is parsed as JSON (a surrounding Markdown code fence is removed
    first, since models sometimes add one despite instructions). The result
    must be an object containing:

    * ``基础信息`` with the fields 最终结果, 分数, 面试岗位, 技术画像, 邀请解读;
    * ``8维度信息`` with all eight dimensions, each an object holding
      得分, 核心亮点, 短板 and 核心依据.

    Only the presence of keys is checked, not the values; extra keys (such as
    the optional 面试题) are allowed.

    Args:
        raw_json: The raw text returned by the model.

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If the text is not valid JSON, if any level that must be
            an object is not one, or if a required key is missing.
    """
    try:
        data = json.loads(_strip_code_fence(raw_json))
    except json.JSONDecodeError as e:
        raise ValueError(f"JSON解析失败:{e}") from e
    # Membership tests below assume dicts; on a str they would silently turn
    # into substring checks, and on a number they would raise TypeError.
    if not isinstance(data, dict):
        raise ValueError("JSON顶层必须是对象")

    # Required fields of the basic-information section.
    base_fields = ["最终结果", "分数", "面试岗位", "技术画像", "邀请解读"]
    if "基础信息" not in data:
        raise ValueError("缺失顶级字段:基础信息")
    base_info = data["基础信息"]
    if not isinstance(base_info, dict):
        raise ValueError("基础信息必须是对象")
    for field in base_fields:
        if field not in base_info:
            raise ValueError(f"基础信息缺失字段:{field}")

    # Required dimensions and the fields each of them must carry.
    dimensions = ["潜力洞察", "叙事逻辑", "软件工艺", "编程能力", "综合网络", "电子电路", "操作系统", "算法能力"]
    dimension_fields = ["得分", "核心亮点", "短板", "核心依据"]
    if "8维度信息" not in data:
        raise ValueError("缺失顶级字段:8维度信息")
    dimension_info = data["8维度信息"]
    if not isinstance(dimension_info, dict):
        raise ValueError("8维度信息必须是对象")
    for dim in dimensions:
        if dim not in dimension_info:
            raise ValueError(f"8维度信息缺失维度:{dim}")
        entry = dimension_info[dim]
        if not isinstance(entry, dict):
            raise ValueError(f"{dim}必须是对象")
        for field in dimension_fields:
            if field not in entry:
                raise ValueError(f"{dim}缺失字段:{field}")
    return data
def process_single_file(file_path: Path) -> bool:
    """Run the full pipeline for one input file: read, prompt, call API, validate, save.

    The result is written to ``Config.OUTPUT_DIR`` as ``<input stem>.json``.
    Note that inputs sharing a stem (e.g. ``a.txt`` and ``a.json``) map to the
    same output file.

    When ``Config.SKIP_EXISTED`` is true and the output file already exists,
    nothing is done and the file counts as successfully processed.

    Args:
        file_path: Path of the input file.

    Returns:
        ``True`` if the output exists after the call (written now or skipped),
        ``False`` if any step failed; failures are logged, not raised.
    """
    # Output name: the input's stem plus a .json suffix.
    output_path = Path(Config.OUTPUT_DIR) / f"{file_path.stem}.json"

    # Skip inputs that were already processed.
    if Config.SKIP_EXISTED and output_path.exists():
        logger.info(f"跳过已处理文件:{file_path.name}")
        return True

    # The result is written here first and renamed into place afterwards, so
    # an interrupted write can never leave a truncated .json file that the
    # skip check above would later mistake for a finished result.
    tmp_path = output_path.with_name(output_path.name + ".tmp")
    try:
        # 1. Read the input.
        raw_text = read_file_content(str(file_path))
        if not raw_text:
            raise ValueError("文件内容为空")
        # 2. Build the prompt.
        prompt = build_prompt(raw_text)
        # 3. Ask the model to extract the data.
        json_str = call_llm_api(prompt)
        # 4. Check the structure of the reply.
        valid_data = validate_json(json_str)
        # 5. Save the result.
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(valid_data, f, ensure_ascii=False, indent=4)
        tmp_path.replace(output_path)
        return True
    except Exception as e:
        # Per-file boundary: record the failure and let the batch continue.
        logger.error(f"处理文件{file_path.name}失败:{str(e)}")
        try:
            tmp_path.unlink()
        except OSError:
            # Best-effort cleanup; usually the temp file was never created.
            pass
        return False
# ===================== 4. 批量处理主函数 =====================
def batch_extract() -> None:
    """Process every supported file in the input directory and print a summary.

    Steps:
      1. Make sure the input and output directories exist.
      2. Collect the regular files directly inside ``Config.INPUT_DIR`` whose
         suffix is listed in ``Config.SUPPORTED_EXTENSIONS`` (compared
         case-insensitively), in sorted order so runs are reproducible.
      3. Process them one by one behind a progress bar, counting successes
         and failures. Files skipped as already processed count as successes.
      4. Print the totals and the locations of the error log and results.

    If no supported file is found, an error is logged and the function returns
    without printing a summary.
    """
    # 1. Check directories.
    check_and_create_dirs()

    # 2. Collect the files to process.
    supported = {ext.lower() for ext in Config.SUPPORTED_EXTENSIONS}
    input_files = sorted(
        path
        for path in Path(Config.INPUT_DIR).glob("*")
        if path.is_file() and path.suffix.lower() in supported
    )
    if not input_files:
        logger.error(f"输入目录{Config.INPUT_DIR}下无支持的文件(支持后缀:{Config.SUPPORTED_EXTENSIONS})")
        return

    # 3. Process the batch with a progress bar.
    success_count = 0
    fail_count = 0
    with tqdm(total=len(input_files), desc="批量抽取进度") as pbar:
        for file in input_files:
            if process_single_file(file):
                success_count += 1
            else:
                fail_count += 1
            pbar.update(1)

    # 4. Print the summary.
    print("\n" + "="*50)
    print("批量处理完成!")
    print(f"总文件数:{len(input_files)}")
    print(f"成功数:{success_count}")
    print(f"失败数:{fail_count}")
    print(f"失败日志:{Config.ERROR_LOG_PATH}")
    print(f"结果目录:{Config.OUTPUT_DIR}")
    print("="*50)
# ===================== 5. 适配其他大模型的扩展(可选) =====================
# 若使用OpenAI/GPT
# def call_llm_api(prompt: str) -> str:
# headers = {"Content-Type": "application/json", "Authorization": f"Bearer {Config.API_KEY}"}
# data = {
# "model": "gpt-3.5-turbo",
# "messages": [{"role": "user", "content": prompt}],
# "temperature": 0.0
# }
# response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=data)
# return response.json()["choices"][0]["message"]["content"]
# ===================== 6. 主入口 =====================
if __name__ == "__main__":
    # Install the dependencies before the first run:
    #   pip install requests tqdm
    # Run the batch extraction.
    batch_extract()