# Source path: yangcong_data/Chat_history/yangcong_csv_data.py
# Snapshot: 2026-04-02 10:40:41 +08:00 (115 lines, 4.9 KiB, Python)
#
# Note from the hosting page: this file contains ambiguous Unicode characters
# that might be confused with other characters. If this is intentional, the
# warning can be safely ignored.
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
import re
import pandas as pd
from typing import Dict, Optional
# Load settings from a fixed .env file before any of them is read.
env_path = r"D:\damn\dialogue\.env"
load_dotenv(dotenv_path=env_path)

# Credentials, model name and sampling temperature come from the environment;
# only the temperature has a fallback value.
API_KEY = os.environ.get("QWEN_API_KEY")
MODEL = os.environ.get("MODEL_NAME")
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.1))

# Shared OpenAI-compatible client pointed at the DashScope compatible-mode URL.
client = OpenAI(
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=API_KEY,
)
# Directory scanned for input .txt records and directory receiving .json output.
CHAT_RECORD_DIR = "./data/txt"
JSON_OUTPUT_DIR = "./data/json"
FILE_ENCODING = "utf-8"

# Names of the fields requested from the model; results keep this order.
EXTRACT_FIELDS = [
    "监护人1姓名",
    "家庭角色",
    "文化程度",
    "职业",
    "年龄",
    "性格特征",
    "联系方式",
    "监护人2姓名",
    "家庭角色_2",
    "文化程度_2",
    "职业_2",
    "年龄_2",
    "性格特征_2",
    "联系方式_2",
    "孩子姓名",
    "性别",
    "孩子年龄",
    "年级",
    "孩子性格特征",
    "学习成绩",
    "家庭地址",
    "家庭基本情况",
    "家庭氛围",
    "亲子关系",
    "家长有无教育分歧",
    "是否经常否定孩子",
    "有无打骂教育",
    "孩子是否在父母身边长大",
    "还有谁参与孩子的养育",
    "孩子成长过程中有何重大影响事件",
    "既往病史",
    "孩子的优点",
    "孩子的缺点",
    "孩子目前情况的描述",
    "参加指导最想解决",
    "问卷评估",
]

# Template result with every field present and set to None.
EMPTY_RESULT = dict.fromkeys(EXTRACT_FIELDS)
def clean_llm_response(llm_text: str) -> str:
    """Normalize a raw model reply into a string suitable for ``json.loads``.

    A Markdown code fence (with an optional language tag such as ``json``) is
    removed from the start and end of the reply, and the outermost ``{...}``
    object is extracted so that any text before or after it is ignored. Text
    inside the object is never modified.

    Args:
        llm_text: Raw text returned by the model; may be empty or ``None``.

    Returns:
        ``""`` when ``llm_text`` is falsy. Otherwise the outermost JSON object
        found in the reply, or, when no complete ``{...}`` pair exists, the
        reply wrapped in a single pair of braces.
    """
    if not llm_text:
        return ""

    text = llm_text.strip()
    # Only strip fences at the edges; a blanket replace would also corrupt
    # field values that happen to contain "```" or the word "JSON".
    text = re.sub(r"^```[A-Za-z]*\s*", "", text)
    text = re.sub(r"\s*```$", "", text).strip()

    # Slice from the first "{" to the last "}" so nested objects keep all of
    # their closing braces.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]

    # No complete object: treat the reply as a bare object body and wrap it.
    return "{" + text.lstrip("{").rstrip("}") + "}"
def extract_chat_info(chat_text: str, file_name: str) -> Dict:
    """Ask the model to extract the configured fields from one text record.

    Only the first 8000 characters of ``chat_text`` are sent to the model.

    Args:
        chat_text: Text of the chat record / registration form.
        file_name: Name of the source file; used only in the failure message.

    Returns:
        A new dict holding exactly the keys in ``EXTRACT_FIELDS``, in that
        order. Fields missing from the reply are ``None``; if the reply is not
        a JSON object, every field is ``None``.

    Raises:
        Errors raised by the API client are not caught here and propagate to
        the caller.
    """
    prompt = f"""
你是专业的暖洋葱家庭教育档案信息提取分析师,仅从以下文本中提取指定字段信息,严格遵守以下规则,违反规则将判定为任务失败:
1. 提取字段:{', '.join(EXTRACT_FIELDS)},仅提取这些字段,不新增、不删减;
2. 内容规则如实提取无相关信息的字段统一填null绝不猜测、编造内容
3. 格式规则:仅返回**纯JSON字符串**,无任何多余内容,禁止加```json、```、注释、解释、换行;
4. 特殊提取要求:
- 年龄:仅提取数字/带""的数字无则null
- 学习成绩:根据打勾标记提取优秀/良好/一般/差无则null
- 性别:仅男/女无则null
- 联系方式仅纯11位手机号码无则null
- 问卷评估:提取原始计分内容(题目+分数),用分号分隔;
- 多信息用顿号分隔无则null。
待提取文本:
{chat_text[:8000]}
"""
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
        timeout=30,
    )
    # The message content can be None; treat that as an empty reply so it
    # takes the same failure path as unparseable text.
    raw_resp = (response.choices[0].message.content or "").strip()

    try:
        extract_json = json.loads(clean_llm_response(raw_resp))
    except json.JSONDecodeError:
        extract_json = None

    if not isinstance(extract_json, dict):
        print(f"解析失败:{file_name} - 大模型返回非标准JSON返回空结果")
        # Return a copy: callers add keys to the result, and handing out the
        # shared EMPTY_RESULT object would pollute it for later files.
        return dict(EMPTY_RESULT)

    return {field: extract_json.get(field) for field in EXTRACT_FIELDS}
def read_chat_txt(file_path: str, encodings: Optional[tuple] = None) -> Optional[str]:
    """Read a text record, tolerating non-UTF-8 files and near-empty content.

    Each candidate encoding is tried in order until the file decodes. A
    leading byte-order mark and surrounding whitespace are removed.

    Args:
        file_path: Path of the text file to read.
        encodings: Encodings to try, in order. Defaults to
            ``(FILE_ENCODING, "gb18030")``.

    Returns:
        The cleaned file content, or ``None`` when it has fewer than 20
        characters or cannot be decoded with any candidate encoding.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    candidates = encodings or (FILE_ENCODING, "gb18030")
    for encoding in candidates:
        try:
            with open(file_path, "r", encoding=encoding) as f:
                content = f.read()
        except UnicodeDecodeError:
            # Wrong guess for this file; try the next candidate.
            continue
        # str.strip() does not remove U+FEFF, so drop a BOM explicitly.
        content = content.lstrip("\ufeff").strip()
        return content if len(content) >= 20 else None

    print(f"读取失败:{file_path} - 无法使用以下编码解码:{', '.join(candidates)}")
    return None
def batch_txt_to_json():
    """Convert every ``.txt`` record in ``CHAT_RECORD_DIR`` to a ``.json`` file.

    Each readable file is passed to ``extract_chat_info``; the result, plus a
    ``"文件名称"`` key holding the source file name, is written to
    ``JSON_OUTPUT_DIR`` under the same base name. A file that cannot be read or
    processed is reported and counted as a failure, and the batch continues.
    A summary of successes and failures is printed at the end.
    """
    if not os.path.isdir(CHAT_RECORD_DIR):
        print(f"输入目录不存在:{CHAT_RECORD_DIR}")
        return

    os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)
    # Sorted so that runs process files in a stable order.
    txt_files = sorted(
        name for name in os.listdir(CHAT_RECORD_DIR) if name.lower().endswith(".txt")
    )
    success_count = 0
    fail_count = 0
    for file_name in txt_files:
        txt_file_path = os.path.join(CHAT_RECORD_DIR, file_name)
        try:
            chat_content = read_chat_txt(txt_file_path)
            if not chat_content:
                print(f"跳过:{file_name} - 内容为空或过短")
                fail_count += 1
                continue
            extract_result = extract_chat_info(chat_content, file_name)
        except Exception as exc:
            # Batch boundary: one bad file or failed API call must not abort
            # the remaining files, so report it and move on.
            print(f"处理失败:{file_name} - {exc}")
            fail_count += 1
            continue

        # Build a new dict instead of mutating the one returned by the
        # extractor, which may be shared with other results.
        record = {**extract_result, "文件名称": file_name}
        json_file_name = os.path.splitext(file_name)[0] + ".json"
        json_file_path = os.path.join(JSON_OUTPUT_DIR, json_file_name)
        with open(json_file_path, "w", encoding="utf-8") as f:
            json.dump(record, f, ensure_ascii=False, indent=4)
        success_count += 1

    print(f"处理完成:成功 {success_count} 个,失败 {fail_count} 个")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    batch_txt_to_json()