fix(crawler): 完善Boss候选人数据解析与简历详情获取

- 新增Candidate.raw_data字段以保存原始数据,便于后续细节获取
- 修改get_candidates调用client方法名和传参,传递原始数据给_parse_candidate
- _parse_candidate方法增强,兼容Boss SDK复杂数据结构,支持多字段解析
- 增加年龄、当前工作、学校等字段的详细解析逻辑
- get_resume_detail改进,支持从raw_data中提取必要参数调用SDK接口
- 统一异常处理并增加详细错误信息提示,确保数据完整性验证
- 统一数据验证失败时打印详细原始数据及转换后内容,方便排查
- scheduler调整Boss简历抓取任务触发间隔为30秒,提高抓取频率
This commit is contained in:
2026-03-24 15:49:11 +08:00
parent b2dde5bbdb
commit 3f261fb3a4
4 changed files with 144 additions and 24 deletions

View File

@@ -2,7 +2,7 @@
from dataclasses import dataclass, field
from datetime import datetime
from decimal import Decimal
from typing import Optional, List
from typing import Optional, List, Any
from enum import Enum
from .enums import Gender
@@ -75,6 +75,9 @@ class Candidate:
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
# 原始数据(用于后续获取详情等操作)
raw_data: Optional[Any] = None
def __post_init__(self):
if self.created_at is None:
self.created_at = datetime.now()

View File

@@ -57,17 +57,46 @@ class BossCrawler(BaseCrawler):
) -> List[Candidate]:
"""获取指定职位下的候选人列表"""
try:
geeks_data = self.client.geek_info(jobid=job_id, page=page)
return [self._parse_candidate(geek_data) for geek_data in geeks_data]
geeks_data = self.client.get_geek_info(jobid=job_id, page=page)
return [self._parse_candidate(geek_data, raw_data=geek_data) for geek_data in geeks_data]
except Exception as e:
print(f"Failed to get candidates from Boss: {e}")
return []
def get_resume_detail(self, candidate: Candidate) -> Optional[Resume]:
"""获取候选人简历详情"""
"""获取候选人简历详情
Args:
candidate: 候选人对象,需要包含 raw_data(从 get_geek_info 返回的原始数据)
"""
try:
geek_data = getattr(candidate, 'raw_data', None)
if geek_data is None:
raise ValueError("candidate.raw_data is required to fetch resume detail from Boss")
# 从 geek_data 中提取必要的参数
# geekCard 中包含 encryptJobId, expectId, securityId, lid
geek_card = getattr(geek_data, 'geekCard', None)
if not geek_card:
raise ValueError("geek_data.geekCard is missing")
encrypt_job_id = getattr(geek_card, 'encryptJobId', '')
expect_id = getattr(geek_card, 'expectId', 0)
security_id = getattr(geek_card, 'securityId', '')
lid = getattr(geek_card, 'lid', '')
if not all([encrypt_job_id, expect_id, security_id, lid]):
raise ValueError(f"Missing required parameters for get_geek_detail: "
f"encryptJobId={encrypt_job_id}, expectId={expect_id}, "
f"securityId={security_id}, lid={lid}")
# 获取候选人详情
detail = self.client.get_detail(candidate)
detail = self.client.get_geek_detail(
encryptJobId=encrypt_job_id,
expectId=expect_id,
securityId=security_id,
lid=lid
)
# 解密简历正文
resume_text = self.client.get_detail_text(detail)
@@ -99,35 +128,82 @@ class BossCrawler(BaseCrawler):
status=JobStatus.ACTIVE
)
def _parse_candidate(self, geek_data: Any) -> Candidate:
"""解析候选人数据"""
# 从 SDK 返回的数据中提取候选人信息
source_id = getattr(geek_data, 'geekId', '') or getattr(geek_data, 'encryptGeekId', '')
def _parse_candidate(self, geek_data: Any, raw_data: Any = None) -> Candidate:
"""解析候选人数据
# 解析薪资期望
salary_str = getattr(geek_data, 'salary', '')
Args:
geek_data: 从 SDK 返回的候选人数据
raw_data: 原始数据对象,用于后续获取简历详情等操作
"""
# 获取 geekCard(Boss SDK 的数据通常在 geekCard 中)
geek_card = getattr(geek_data, 'geekCard', None) or geek_data
# 调试:打印 geek_data 和 geek_card 的所有属性
print(f"[DEBUG] geek_data type: {type(geek_data)}")
print(f"[DEBUG] geek_data attrs: {dir(geek_data) if hasattr(geek_data, '__dict__') else 'no __dict__'}")
if geek_card is not geek_data:
print(f"[DEBUG] geek_card type: {type(geek_card)}")
print(f"[DEBUG] geek_card attrs: {dir(geek_card) if hasattr(geek_card, '__dict__') else 'no __dict__'}")
# 从 SDK 返回的数据中提取候选人信息
source_id = (getattr(geek_data, 'geekId', '') or
getattr(geek_data, 'encryptGeekId', '') or
getattr(geek_card, 'geekId', '') or
getattr(geek_card, 'encryptGeekId', ''))
# 解析姓名(Boss SDK 使用 geekName 字段)
name = (getattr(geek_card, 'geekName', '') or
getattr(geek_data, 'geekName', ''))
print(f"[DEBUG] Parsed name: '{name}', source_id: '{source_id}'")
# 解析薪资期望(Boss SDK 使用 salary 或 lowSalary/highSalary)
salary_str = (getattr(geek_card, 'salary', '') or
getattr(geek_data, 'salary', ''))
salary_range = self._parse_salary_range(salary_str)
# 解析性别
gender = self._parse_gender(getattr(geek_data, 'gender', ''))
# 解析性别(Boss SDK 使用 geekGender)
gender = self._parse_gender(
getattr(geek_card, 'geekGender', '') or
getattr(geek_data, 'geekGender', '')
)
# 解析工作年限
work_years = self._parse_work_years(getattr(geek_data, 'workYears', ''))
# 解析工作年限(Boss SDK 使用 geekWorkYear)
work_years = self._parse_work_years(
getattr(geek_card, 'geekWorkYear', '') or
getattr(geek_data, 'geekWorkYear', '')
)
# 解析年龄(Boss SDK 使用 ageDesc,如 "22岁")
age_desc = getattr(geek_card, 'ageDesc', '') or getattr(geek_data, 'ageDesc', '')
age = self._parse_age_from_desc(age_desc)
# 解析学历(Boss SDK 使用 geekDegree 或 geekEdu)
education = (getattr(geek_card, 'geekDegree', None) or
getattr(geek_card, 'geekEdu', None) or
getattr(geek_data, 'geekDegree', None))
# 解析当前公司(从 geekWorks 或 geekDoneWorks 获取最近的工作经历)
current_company, current_position = self._parse_current_work(geek_card)
# 解析学校(从 geekEdus 获取最高学历的学校)
school = self._parse_school(geek_card)
return Candidate(
source=CandidateSource.BOSS,
source_id=str(source_id),
name=getattr(geek_data, 'name', ''),
name=name,
gender=gender,
age=getattr(geek_data, 'age', None),
location=getattr(geek_data, 'location', None),
current_company=getattr(geek_data, 'company', None),
current_position=getattr(geek_data, 'position', None),
age=age,
location=getattr(geek_card, 'expectLocationName', None) or getattr(geek_data, 'expectLocationName', None),
current_company=current_company,
current_position=current_position,
work_years=work_years,
education=getattr(geek_data, 'education', None),
school=getattr(geek_data, 'school', None),
education=education,
school=school,
salary_expectation=salary_range,
status=CandidateStatus.NEW
status=CandidateStatus.NEW,
raw_data=raw_data
)
def _parse_resume_text(self, resume_text: str) -> ResumeParsed:
@@ -205,3 +281,38 @@ class BossCrawler(BaseCrawler):
if match:
return Decimal(match.group(1))
return None
def _parse_age_from_desc(self, age_desc: str) -> Optional[int]:
"""从年龄描述中解析年龄(如 '22岁' -> 22"""
if not age_desc:
return None
match = re.search(r'(\d+)', str(age_desc))
if match:
return int(match.group(1))
return None
def _parse_current_work(self, geek_card: Any) -> tuple:
"""解析当前工作信息(公司、职位)"""
# 从 geekWorks 或 geekDoneWorks 获取最近的工作经历
works = getattr(geek_card, 'geekWorks', None) or getattr(geek_card, 'geekDoneWorks', None)
if works and len(works) > 0:
# 取第一个(最近的工作)
latest_work = works[0]
company = getattr(latest_work, 'companyName', None)
position = getattr(latest_work, 'positionName', None)
return company, position
return None, None
def _parse_school(self, geek_card: Any) -> Optional[str]:
"""解析学校信息"""
# 从 geekEdus 获取最高学历的学校
edus = getattr(geek_card, 'geekEdus', None)
if edus and len(edus) > 0:
# 取第一个(最高的学历)
highest_edu = edus[0]
return getattr(highest_edu, 'schoolName', None)
# 尝试 geekHighestDegreeEdu
highest = getattr(geek_card, 'geekHighestDegreeEdu', None)
if highest:
return getattr(highest, 'schoolName', None)
return None

View File

@@ -137,6 +137,12 @@ class UnifiedIngestionService:
# 2. 数据验证
validation_result = self._validate(normalized)
if not validation_result.is_valid:
# 打印原始数据和标准化数据,方便排查
print(f"[数据验证失败] 错误: {validation_result.error_messages}")
print(f"[数据验证失败] 原始数据: {raw_data}")
print(f"[数据验证失败] 标准化后候选人: name={normalized.candidate.name}, "
f"source={normalized.candidate.source}, source_id={normalized.candidate.source_id}")
print(f"[数据验证失败] 标准化后简历: raw_content长度={len(normalized.resume.raw_content) if normalized.resume.raw_content else 0}")
return IngestionResult.failed_result(
errors=validation_result.error_messages,
message="数据验证失败"

View File

@@ -45,7 +45,7 @@ class CrawlScheduler:
# 每30秒爬取一次 Boss 直聘
self.scheduler.add_job(
self._crawl_boss,
trigger=IntervalTrigger(minutes=1),
trigger=IntervalTrigger(seconds=30),
id="crawl_boss",
name="爬取Boss直聘简历",
replace_existing=True