From 3f261fb3a40f5b55e4dc24a8f520cd8f91a9bf6d Mon Sep 17 00:00:00 2001 From: JiaoTianBo Date: Tue, 24 Mar 2026 15:49:11 +0800 Subject: [PATCH] =?UTF-8?q?fix(crawler):=20=E5=AE=8C=E5=96=84Boss=E5=80=99?= =?UTF-8?q?=E9=80=89=E4=BA=BA=E6=95=B0=E6=8D=AE=E8=A7=A3=E6=9E=90=E4=B8=8E?= =?UTF-8?q?=E7=AE=80=E5=8E=86=E8=AF=A6=E6=83=85=E8=8E=B7=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增Candidate.raw_data字段以保存原始数据,便于后续细节获取 - 修改get_candidates调用client方法名和传参,传递原始数据给_parse_candidate - _parse_candidate方法增强,兼容Boss SDK复杂数据结构,支持多字段解析 - 增加年龄、当前工作、学校等字段的详细解析逻辑 - get_resume_detail改进,支持从raw_data中提取必要参数调用SDK接口 - 统一异常处理并增加详细错误信息提示,确保数据完整性验证 - 统一数据验证失败时打印详细原始数据及转换后内容,方便排查 - scheduler调整Boss简历抓取任务触发间隔为30秒,提高抓取频率 --- .../yinlihupo/ylhp_hr_2_0/domain/candidate.py | 5 +- .../service/crawler/boss_crawler.py | 155 +++++++++++++++--- .../ingestion/unified_ingestion_service.py | 6 + .../ylhp_hr_2_0/service/scheduler.py | 2 +- 4 files changed, 144 insertions(+), 24 deletions(-) diff --git a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/domain/candidate.py b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/domain/candidate.py index 6b76bc2..2165dc1 100644 --- a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/domain/candidate.py +++ b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/domain/candidate.py @@ -2,7 +2,7 @@ from dataclasses import dataclass, field from datetime import datetime from decimal import Decimal -from typing import Optional, List +from typing import Optional, List, Any from enum import Enum from .enums import Gender @@ -75,6 +75,9 @@ class Candidate: created_at: Optional[datetime] = None updated_at: Optional[datetime] = None + # 原始数据(用于后续获取详情等操作) + raw_data: Optional[Any] = None + def __post_init__(self): if self.created_at is None: self.created_at = datetime.now() diff --git a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/crawler/boss_crawler.py b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/crawler/boss_crawler.py index 5448fd9..f9f8c42 100644 --- a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/crawler/boss_crawler.py +++ b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/crawler/boss_crawler.py @@ -57,17 +57,46 @@ class BossCrawler(BaseCrawler): ) -> List[Candidate]: """获取指定职位下的候选人列表""" try: - geeks_data = self.client.geek_info(jobid=job_id, page=page) - return [self._parse_candidate(geek_data) for geek_data in geeks_data] + geeks_data = self.client.get_geek_info(jobid=job_id, page=page) + return [self._parse_candidate(geek_data, raw_data=geek_data) for geek_data in geeks_data] except Exception as e: print(f"Failed to get candidates from Boss: {e}") return [] def get_resume_detail(self, candidate: Candidate) -> Optional[Resume]: - """获取候选人简历详情""" + """获取候选人简历详情 + + Args: + candidate: 候选人对象,需要包含 raw_data(从 get_geek_info 返回的原始数据) + """ try: + geek_data = getattr(candidate, 'raw_data', None) + if geek_data is None: + raise ValueError("candidate.raw_data is required to fetch resume detail from Boss") + + # 从 geek_data 中提取必要的参数 + # geekCard 中包含 encryptJobId, expectId, securityId, lid + geek_card = getattr(geek_data, 'geekCard', None) + if not geek_card: + raise ValueError("geek_data.geekCard is missing") + + encrypt_job_id = getattr(geek_card, 'encryptJobId', '') + expect_id = getattr(geek_card, 'expectId', 0) + security_id = getattr(geek_card, 'securityId', '') + lid = getattr(geek_card, 'lid', '') + + if not all([encrypt_job_id, expect_id, security_id, lid]): + raise ValueError(f"Missing required parameters for get_geek_detail: " + f"encryptJobId={encrypt_job_id}, expectId={expect_id}, " + f"securityId={security_id}, lid={lid}") + # 获取候选人详情 - detail = self.client.get_detail(candidate) + detail = self.client.get_geek_detail( + encryptJobId=encrypt_job_id, + expectId=expect_id, + securityId=security_id, + lid=lid + ) # 解密简历正文 resume_text = self.client.get_detail_text(detail) @@ -99,35 +128,82 @@ class BossCrawler(BaseCrawler): status=JobStatus.ACTIVE ) - def _parse_candidate(self, geek_data: Any) -> Candidate: - """解析候选人数据""" - # 从 SDK 返回的数据中提取候选人信息 - source_id = getattr(geek_data, 'geekId', '') or getattr(geek_data, 'encryptGeekId', '') + def _parse_candidate(self, geek_data: Any, raw_data: Any = None) -> Candidate: + """解析候选人数据 - # 解析薪资期望 - salary_str = getattr(geek_data, 'salary', '') + Args: + geek_data: 从 SDK 返回的候选人数据 + raw_data: 原始数据对象,用于后续获取简历详情等操作 + """ + # 获取 geekCard(Boss SDK 的数据通常在 geekCard 中) + geek_card = getattr(geek_data, 'geekCard', None) or geek_data + + # 调试:打印 geek_data 和 geek_card 的所有属性 + print(f"[DEBUG] geek_data type: {type(geek_data)}") + print(f"[DEBUG] geek_data attrs: {dir(geek_data) if hasattr(geek_data, '__dict__') else 'no __dict__'}") + if geek_card is not geek_data: + print(f"[DEBUG] geek_card type: {type(geek_card)}") + print(f"[DEBUG] geek_card attrs: {dir(geek_card) if hasattr(geek_card, '__dict__') else 'no __dict__'}") + + # 从 SDK 返回的数据中提取候选人信息 + source_id = (getattr(geek_data, 'geekId', '') or + getattr(geek_data, 'encryptGeekId', '') or + getattr(geek_card, 'geekId', '') or + getattr(geek_card, 'encryptGeekId', '')) + + # 解析姓名(Boss SDK 使用 geekName 字段) + name = (getattr(geek_card, 'geekName', '') or + getattr(geek_data, 'geekName', '')) + + print(f"[DEBUG] Parsed name: '{name}', source_id: '{source_id}'") + + # 解析薪资期望(Boss SDK 使用 salary 或 lowSalary/highSalary) + salary_str = (getattr(geek_card, 'salary', '') or + getattr(geek_data, 'salary', '')) salary_range = self._parse_salary_range(salary_str) - # 解析性别 - gender = self._parse_gender(getattr(geek_data, 'gender', '')) + # 解析性别(Boss SDK 使用 geekGender) + gender = self._parse_gender( + getattr(geek_card, 'geekGender', '') or + getattr(geek_data, 'geekGender', '') + ) - # 解析工作年限 - work_years = self._parse_work_years(getattr(geek_data, 'workYears', '')) + # 解析工作年限(Boss SDK 使用 geekWorkYear) + work_years = self._parse_work_years( + getattr(geek_card, 'geekWorkYear', '') or + getattr(geek_data, 'geekWorkYear', '') + ) + + # 解析年龄(Boss SDK 使用 ageDesc 如 "22岁") + age_desc = getattr(geek_card, 'ageDesc', '') or getattr(geek_data, 'ageDesc', '') + age = self._parse_age_from_desc(age_desc) + + # 解析学历(Boss SDK 使用 geekDegree 或 geekEdu) + education = (getattr(geek_card, 'geekDegree', None) or + getattr(geek_card, 'geekEdu', None) or + getattr(geek_data, 'geekDegree', None)) + + # 解析当前公司(从 geekWorks 或 geekDoneWorks 获取最近的工作经历) + current_company, current_position = self._parse_current_work(geek_card) + + # 解析学校(从 geekEdus 获取最高学历的学校) + school = self._parse_school(geek_card) return Candidate( source=CandidateSource.BOSS, source_id=str(source_id), - name=getattr(geek_data, 'name', ''), + name=name, gender=gender, - age=getattr(geek_data, 'age', None), - location=getattr(geek_data, 'location', None), - current_company=getattr(geek_data, 'company', None), - current_position=getattr(geek_data, 'position', None), + age=age, + location=getattr(geek_card, 'expectLocationName', None) or getattr(geek_data, 'expectLocationName', None), + current_company=current_company, + current_position=current_position, work_years=work_years, - education=getattr(geek_data, 'education', None), - school=getattr(geek_data, 'school', None), + education=education, + school=school, salary_expectation=salary_range, - status=CandidateStatus.NEW + status=CandidateStatus.NEW, + raw_data=raw_data ) def _parse_resume_text(self, resume_text: str) -> ResumeParsed: @@ -205,3 +281,38 @@ class BossCrawler(BaseCrawler): if match: return Decimal(match.group(1)) return None + + def _parse_age_from_desc(self, age_desc: str) -> Optional[int]: + """从年龄描述中解析年龄(如 '22岁' -> 22)""" + if not age_desc: + return None + match = re.search(r'(\d+)', str(age_desc)) + if match: + return int(match.group(1)) + return None + + def _parse_current_work(self, geek_card: Any) -> tuple: + """解析当前工作信息(公司、职位)""" + # 从 geekWorks 或 geekDoneWorks 获取最近的工作经历 + works = getattr(geek_card, 'geekWorks', None) or getattr(geek_card, 'geekDoneWorks', None) + if works and len(works) > 0: + # 取第一个(最近的工作) + latest_work = works[0] + company = getattr(latest_work, 'companyName', None) + position = getattr(latest_work, 'positionName', None) + return company, position + return None, None + + def _parse_school(self, geek_card: Any) -> Optional[str]: + """解析学校信息""" + # 从 geekEdus 获取最高学历的学校 + edus = getattr(geek_card, 'geekEdus', None) + if edus and len(edus) > 0: + # 取第一个(最高的学历) + highest_edu = edus[0] + return getattr(highest_edu, 'schoolName', None) + # 尝试 geekHighestDegreeEdu + highest = getattr(geek_card, 'geekHighestDegreeEdu', None) + if highest: + return getattr(highest, 'schoolName', None) + return None diff --git a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/ingestion/unified_ingestion_service.py b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/ingestion/unified_ingestion_service.py index 6462e54..3770a3a 100644 --- a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/ingestion/unified_ingestion_service.py +++ b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/ingestion/unified_ingestion_service.py @@ -137,6 +137,12 @@ class UnifiedIngestionService: # 2. 数据验证 validation_result = self._validate(normalized) if not validation_result.is_valid: + # 打印原始数据和标准化数据,方便排查 + print(f"[数据验证失败] 错误: {validation_result.error_messages}") + print(f"[数据验证失败] 原始数据: {raw_data}") + print(f"[数据验证失败] 标准化后候选人: name={normalized.candidate.name}, " + f"source={normalized.candidate.source}, source_id={normalized.candidate.source_id}") + print(f"[数据验证失败] 标准化后简历: raw_content长度={len(normalized.resume.raw_content) if normalized.resume.raw_content else 0}") return IngestionResult.failed_result( errors=validation_result.error_messages, message="数据验证失败" diff --git a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/scheduler.py b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/scheduler.py index a152a60..f52d6c1 100644 --- a/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/scheduler.py +++ b/src/main/python/cn/yinlihupo/ylhp_hr_2_0/service/scheduler.py @@ -45,7 +45,7 @@ class CrawlScheduler: # 每30分钟爬取一次 Boss 直聘 self.scheduler.add_job( self._crawl_boss, - trigger=IntervalTrigger(minutes=1), + trigger=IntervalTrigger(seconds=30), id="crawl_boss", name="爬取Boss直聘简历", replace_existing=True