Implement feature extraction and inference enhancements, including async processing and MongoDB integration

2026-01-30 14:43:30 +08:00
parent 3d78b88d47
commit 1bd4547b99
7 changed files with 166 additions and 3 deletions
--- a/feature_extraction.py
+++ b/feature_extraction.py
@@ -391,6 +391,46 @@ def process_single_txt(file_path, output_dir=OUTPUT_DIR):
        raise RuntimeError(f"保存JSON失败：{str(e)}") from e


+async def process_single(content : str):
+    # 3. 构建提示词并调用API
+    prompt = build_extraction_prompt(content)
+    try:
+        print("正在调用API提取特征...")
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=TEMPERATURE,
+            max_tokens=8000,
+            timeout=30
+        )
+    except Exception as e:
+        raise RuntimeError(f"API调用失败：{str(e)}") from e
+
+    # 4. 提取并清洗JSON
+    feature_json_str = response.choices[0].message.content.strip()
+    json_match = re.search(r"\{[\s\S]*\}", feature_json_str)
+    if not json_match:
+        raise RuntimeError(f"API返回无有效JSON：{feature_json_str[:200]}...")
+    cleaned_json = clean_and_fix_json(json_match.group())
+
+    # 5. 解析并验证JSON
+    try:
+        parsed_dict = json.loads(cleaned_json)
+    except json.JSONDecodeError as e:
+        raise RuntimeError(f"JSON解析失败：{str(e)} | 清洗后内容：{cleaned_json[:500]}") from e
+
+    # 验证核心字段
+    if "Follow_up_Priority" not in parsed_dict:
+        raise RuntimeError("核心字段Follow_up_Priority缺失")
+    fu_prio = parsed_dict["Follow_up_Priority"]
+    if not isinstance(fu_prio, dict) or "value" not in fu_prio or "evidence" not in fu_prio:
+        raise RuntimeError("Follow_up_Priority格式错误（需包含value和evidence）")
+    if not isinstance(fu_prio["evidence"], list):
+        raise RuntimeError("evidence必须是数组类型")
+    if len(str(fu_prio["value"])) >= 20:
+        raise RuntimeError(f"value超20字限制：{fu_prio['value']}")
+    
+    return parsed_dict

 if __name__ == "__main__":
    # 要处理的源文件夹路径