Implement feature extraction and inference enhancements, including async processing and MongoDB integration

This commit is contained in:
2026-01-30 14:43:30 +08:00
parent 3d78b88d47
commit 1bd4547b99
7 changed files with 166 additions and 3 deletions

View File

@@ -391,6 +391,46 @@ def process_single_txt(file_path, output_dir=OUTPUT_DIR):
raise RuntimeError(f"保存JSON失败{str(e)}") from e
async def process_single(content: str) -> dict:
    """Extract a validated feature dictionary from one piece of text.

    Builds the extraction prompt for ``content``, sends it to the chat
    completion API, pulls the JSON object out of the reply, cleans it with
    ``clean_and_fix_json``, parses it and validates the
    ``Follow_up_Priority`` field.

    Args:
        content: Raw text to extract features from.

    Returns:
        The parsed JSON object returned by the model.

    Raises:
        RuntimeError: If the API call fails, the reply is empty or contains
            no JSON object, the cleaned JSON cannot be parsed, or
            ``Follow_up_Priority`` is missing or malformed.
    """
    # Function-scope imports keep this block self-contained.
    import asyncio
    import functools
    import inspect

    # 3. Build the prompt and call the API.
    prompt = build_extraction_prompt(content)
    request = functools.partial(
        client.chat.completions.create,
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=TEMPERATURE,
        max_tokens=8000,
        timeout=30,
    )
    try:
        print("正在调用API提取特征...")
        # Run the call in the default executor so that a blocking
        # (synchronous) client does not stall the event loop while the
        # request is in flight.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(None, request)
        # With an async client the call above only builds an awaitable;
        # resolve it here instead of failing later on ``.choices``.
        if inspect.isawaitable(response):
            response = await response
    except Exception as e:
        raise RuntimeError(f"API调用失败{str(e)}") from e

    # 4. Extract and clean the JSON payload.
    # The reply may carry no choices or a ``None`` content; report that as
    # a RuntimeError like every other failure path of this function.
    if not response.choices or response.choices[0].message.content is None:
        raise RuntimeError("API返回内容为空")
    feature_json_str = response.choices[0].message.content.strip()
    # Greedy match from the first "{" to the last "}" so any text the model
    # puts around the JSON object is dropped.
    json_match = re.search(r"\{[\s\S]*\}", feature_json_str)
    if not json_match:
        raise RuntimeError(f"API返回无有效JSON{feature_json_str[:200]}...")
    cleaned_json = clean_and_fix_json(json_match.group())

    # 5. Parse and validate the JSON.
    try:
        parsed_dict = json.loads(cleaned_json)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"JSON解析失败{str(e)} | 清洗后内容:{cleaned_json[:500]}") from e

    # Validate the core field.
    if "Follow_up_Priority" not in parsed_dict:
        raise RuntimeError("核心字段Follow_up_Priority缺失")
    fu_prio = parsed_dict["Follow_up_Priority"]
    if not isinstance(fu_prio, dict) or "value" not in fu_prio or "evidence" not in fu_prio:
        raise RuntimeError("Follow_up_Priority格式错误需包含value和evidence")
    if not isinstance(fu_prio["evidence"], list):
        raise RuntimeError("evidence必须是数组类型")
    # NOTE(review): ">=" also rejects a value of exactly 20 characters even
    # though the message says the limit was exceeded -- confirm the intended
    # bound before changing it.
    if len(str(fu_prio["value"])) >= 20:
        raise RuntimeError(f"value超20字限制{fu_prio['value']}")
    return parsed_dict
if __name__ == "__main__":
# 要处理的源文件夹路径