fix(crawler): 完善Boss候选人数据解析与简历详情获取
- 新增Candidate.raw_data字段以保存原始数据,便于后续细节获取
- 修改get_candidates调用client方法名和传参,传递原始数据给_parse_candidate
- _parse_candidate方法增强,兼容Boss SDK复杂数据结构,支持多字段解析
- 增加年龄、当前工作、学校等字段的详细解析逻辑
- get_resume_detail改进,支持从raw_data中提取必要参数调用SDK接口
- 统一异常处理并增加详细错误信息提示,确保数据完整性验证
- 统一数据验证失败时打印详细原始数据及转换后内容,方便排查
- scheduler调整Boss简历抓取任务触发间隔为30秒,提高抓取频率
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Any
|
||||
from enum import Enum
|
||||
|
||||
from .enums import Gender
|
||||
@@ -75,6 +75,9 @@ class Candidate:
|
||||
created_at: Optional[datetime] = None
|
||||
updated_at: Optional[datetime] = None
|
||||
|
||||
# 原始数据(用于后续获取详情等操作)
|
||||
raw_data: Optional[Any] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.created_at is None:
|
||||
self.created_at = datetime.now()
|
||||
|
||||
@@ -57,17 +57,46 @@ class BossCrawler(BaseCrawler):
|
||||
) -> List[Candidate]:
|
||||
"""获取指定职位下的候选人列表"""
|
||||
try:
|
||||
geeks_data = self.client.geek_info(jobid=job_id, page=page)
|
||||
return [self._parse_candidate(geek_data) for geek_data in geeks_data]
|
||||
geeks_data = self.client.get_geek_info(jobid=job_id, page=page)
|
||||
return [self._parse_candidate(geek_data, raw_data=geek_data) for geek_data in geeks_data]
|
||||
except Exception as e:
|
||||
print(f"Failed to get candidates from Boss: {e}")
|
||||
return []
|
||||
|
||||
def get_resume_detail(self, candidate: Candidate) -> Optional[Resume]:
|
||||
"""获取候选人简历详情"""
|
||||
"""获取候选人简历详情
|
||||
|
||||
Args:
|
||||
candidate: 候选人对象,需要包含 raw_data(从 get_geek_info 返回的原始数据)
|
||||
"""
|
||||
try:
|
||||
geek_data = getattr(candidate, 'raw_data', None)
|
||||
if geek_data is None:
|
||||
raise ValueError("candidate.raw_data is required to fetch resume detail from Boss")
|
||||
|
||||
# 从 geek_data 中提取必要的参数
|
||||
# geekCard 中包含 encryptJobId, expectId, securityId, lid
|
||||
geek_card = getattr(geek_data, 'geekCard', None)
|
||||
if not geek_card:
|
||||
raise ValueError("geek_data.geekCard is missing")
|
||||
|
||||
encrypt_job_id = getattr(geek_card, 'encryptJobId', '')
|
||||
expect_id = getattr(geek_card, 'expectId', 0)
|
||||
security_id = getattr(geek_card, 'securityId', '')
|
||||
lid = getattr(geek_card, 'lid', '')
|
||||
|
||||
if not all([encrypt_job_id, expect_id, security_id, lid]):
|
||||
raise ValueError(f"Missing required parameters for get_geek_detail: "
|
||||
f"encryptJobId={encrypt_job_id}, expectId={expect_id}, "
|
||||
f"securityId={security_id}, lid={lid}")
|
||||
|
||||
# 获取候选人详情
|
||||
detail = self.client.get_detail(candidate)
|
||||
detail = self.client.get_geek_detail(
|
||||
encryptJobId=encrypt_job_id,
|
||||
expectId=expect_id,
|
||||
securityId=security_id,
|
||||
lid=lid
|
||||
)
|
||||
|
||||
# 解密简历正文
|
||||
resume_text = self.client.get_detail_text(detail)
|
||||
@@ -99,35 +128,82 @@ class BossCrawler(BaseCrawler):
|
||||
status=JobStatus.ACTIVE
|
||||
)
|
||||
|
||||
def _parse_candidate(self, geek_data: Any) -> Candidate:
|
||||
"""解析候选人数据"""
|
||||
# 从 SDK 返回的数据中提取候选人信息
|
||||
source_id = getattr(geek_data, 'geekId', '') or getattr(geek_data, 'encryptGeekId', '')
|
||||
def _parse_candidate(self, geek_data: Any, raw_data: Any = None) -> Candidate:
|
||||
"""解析候选人数据
|
||||
|
||||
# 解析薪资期望
|
||||
salary_str = getattr(geek_data, 'salary', '')
|
||||
Args:
|
||||
geek_data: 从 SDK 返回的候选人数据
|
||||
raw_data: 原始数据对象,用于后续获取简历详情等操作
|
||||
"""
|
||||
# 获取 geekCard(Boss SDK 的数据通常在 geekCard 中)
|
||||
geek_card = getattr(geek_data, 'geekCard', None) or geek_data
|
||||
|
||||
# 调试:打印 geek_data 和 geek_card 的所有属性
|
||||
print(f"[DEBUG] geek_data type: {type(geek_data)}")
|
||||
print(f"[DEBUG] geek_data attrs: {dir(geek_data) if hasattr(geek_data, '__dict__') else 'no __dict__'}")
|
||||
if geek_card is not geek_data:
|
||||
print(f"[DEBUG] geek_card type: {type(geek_card)}")
|
||||
print(f"[DEBUG] geek_card attrs: {dir(geek_card) if hasattr(geek_card, '__dict__') else 'no __dict__'}")
|
||||
|
||||
# 从 SDK 返回的数据中提取候选人信息
|
||||
source_id = (getattr(geek_data, 'geekId', '') or
|
||||
getattr(geek_data, 'encryptGeekId', '') or
|
||||
getattr(geek_card, 'geekId', '') or
|
||||
getattr(geek_card, 'encryptGeekId', ''))
|
||||
|
||||
# 解析姓名(Boss SDK 使用 geekName 字段)
|
||||
name = (getattr(geek_card, 'geekName', '') or
|
||||
getattr(geek_data, 'geekName', ''))
|
||||
|
||||
print(f"[DEBUG] Parsed name: '{name}', source_id: '{source_id}'")
|
||||
|
||||
# 解析薪资期望(Boss SDK 使用 salary 或 lowSalary/highSalary)
|
||||
salary_str = (getattr(geek_card, 'salary', '') or
|
||||
getattr(geek_data, 'salary', ''))
|
||||
salary_range = self._parse_salary_range(salary_str)
|
||||
|
||||
# 解析性别
|
||||
gender = self._parse_gender(getattr(geek_data, 'gender', ''))
|
||||
# 解析性别(Boss SDK 使用 geekGender)
|
||||
gender = self._parse_gender(
|
||||
getattr(geek_card, 'geekGender', '') or
|
||||
getattr(geek_data, 'geekGender', '')
|
||||
)
|
||||
|
||||
# 解析工作年限
|
||||
work_years = self._parse_work_years(getattr(geek_data, 'workYears', ''))
|
||||
# 解析工作年限(Boss SDK 使用 geekWorkYear)
|
||||
work_years = self._parse_work_years(
|
||||
getattr(geek_card, 'geekWorkYear', '') or
|
||||
getattr(geek_data, 'geekWorkYear', '')
|
||||
)
|
||||
|
||||
# 解析年龄(Boss SDK 使用 ageDesc 如 "22岁")
|
||||
age_desc = getattr(geek_card, 'ageDesc', '') or getattr(geek_data, 'ageDesc', '')
|
||||
age = self._parse_age_from_desc(age_desc)
|
||||
|
||||
# 解析学历(Boss SDK 使用 geekDegree 或 geekEdu)
|
||||
education = (getattr(geek_card, 'geekDegree', None) or
|
||||
getattr(geek_card, 'geekEdu', None) or
|
||||
getattr(geek_data, 'geekDegree', None))
|
||||
|
||||
# 解析当前公司(从 geekWorks 或 geekDoneWorks 获取最近的工作经历)
|
||||
current_company, current_position = self._parse_current_work(geek_card)
|
||||
|
||||
# 解析学校(从 geekEdus 获取最高学历的学校)
|
||||
school = self._parse_school(geek_card)
|
||||
|
||||
return Candidate(
|
||||
source=CandidateSource.BOSS,
|
||||
source_id=str(source_id),
|
||||
name=getattr(geek_data, 'name', ''),
|
||||
name=name,
|
||||
gender=gender,
|
||||
age=getattr(geek_data, 'age', None),
|
||||
location=getattr(geek_data, 'location', None),
|
||||
current_company=getattr(geek_data, 'company', None),
|
||||
current_position=getattr(geek_data, 'position', None),
|
||||
age=age,
|
||||
location=getattr(geek_card, 'expectLocationName', None) or getattr(geek_data, 'expectLocationName', None),
|
||||
current_company=current_company,
|
||||
current_position=current_position,
|
||||
work_years=work_years,
|
||||
education=getattr(geek_data, 'education', None),
|
||||
school=getattr(geek_data, 'school', None),
|
||||
education=education,
|
||||
school=school,
|
||||
salary_expectation=salary_range,
|
||||
status=CandidateStatus.NEW
|
||||
status=CandidateStatus.NEW,
|
||||
raw_data=raw_data
|
||||
)
|
||||
|
||||
def _parse_resume_text(self, resume_text: str) -> ResumeParsed:
|
||||
@@ -205,3 +281,38 @@ class BossCrawler(BaseCrawler):
|
||||
if match:
|
||||
return Decimal(match.group(1))
|
||||
return None
|
||||
|
||||
def _parse_age_from_desc(self, age_desc: str) -> Optional[int]:
|
||||
"""从年龄描述中解析年龄(如 '22岁' -> 22)"""
|
||||
if not age_desc:
|
||||
return None
|
||||
match = re.search(r'(\d+)', str(age_desc))
|
||||
if match:
|
||||
return int(match.group(1))
|
||||
return None
|
||||
|
||||
def _parse_current_work(self, geek_card: Any) -> tuple:
|
||||
"""解析当前工作信息(公司、职位)"""
|
||||
# 从 geekWorks 或 geekDoneWorks 获取最近的工作经历
|
||||
works = getattr(geek_card, 'geekWorks', None) or getattr(geek_card, 'geekDoneWorks', None)
|
||||
if works and len(works) > 0:
|
||||
# 取第一个(最近的工作)
|
||||
latest_work = works[0]
|
||||
company = getattr(latest_work, 'companyName', None)
|
||||
position = getattr(latest_work, 'positionName', None)
|
||||
return company, position
|
||||
return None, None
|
||||
|
||||
def _parse_school(self, geek_card: Any) -> Optional[str]:
|
||||
"""解析学校信息"""
|
||||
# 从 geekEdus 获取最高学历的学校
|
||||
edus = getattr(geek_card, 'geekEdus', None)
|
||||
if edus and len(edus) > 0:
|
||||
# 取第一个(最高的学历)
|
||||
highest_edu = edus[0]
|
||||
return getattr(highest_edu, 'schoolName', None)
|
||||
# 尝试 geekHighestDegreeEdu
|
||||
highest = getattr(geek_card, 'geekHighestDegreeEdu', None)
|
||||
if highest:
|
||||
return getattr(highest, 'schoolName', None)
|
||||
return None
|
||||
|
||||
@@ -137,6 +137,12 @@ class UnifiedIngestionService:
|
||||
# 2. 数据验证
|
||||
validation_result = self._validate(normalized)
|
||||
if not validation_result.is_valid:
|
||||
# 打印原始数据和标准化数据,方便排查
|
||||
print(f"[数据验证失败] 错误: {validation_result.error_messages}")
|
||||
print(f"[数据验证失败] 原始数据: {raw_data}")
|
||||
print(f"[数据验证失败] 标准化后候选人: name={normalized.candidate.name}, "
|
||||
f"source={normalized.candidate.source}, source_id={normalized.candidate.source_id}")
|
||||
print(f"[数据验证失败] 标准化后简历: raw_content长度={len(normalized.resume.raw_content) if normalized.resume.raw_content else 0}")
|
||||
return IngestionResult.failed_result(
|
||||
errors=validation_result.error_messages,
|
||||
message="数据验证失败"
|
||||
|
||||
@@ -45,7 +45,7 @@ class CrawlScheduler:
|
||||
# 每30秒爬取一次 Boss 直聘
|
||||
self.scheduler.add_job(
|
||||
self._crawl_boss,
|
||||
trigger=IntervalTrigger(minutes=1),
|
||||
trigger=IntervalTrigger(seconds=30),
|
||||
id="crawl_boss",
|
||||
name="爬取Boss直聘简历",
|
||||
replace_existing=True
|
||||
|
||||
Reference in New Issue
Block a user