上传文件至「Chat_history」

洋葱聊天记录
This commit is contained in:
2026-04-02 10:40:41 +08:00
commit 8419ed0ffb
13 changed files with 35004 additions and 0 deletions

View File

@@ -0,0 +1,115 @@
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
import re
import pandas as pd
from typing import Dict, Optional
# --- Environment & API-client configuration ---
# Load credentials and model settings from a fixed local .env file.
env_path = r"D:\damn\dialogue\.env"
load_dotenv(dotenv_path=env_path)
# DashScope (Qwen) credentials and sampling settings; TEMPERATURE falls
# back to 0.1 when the variable is unset.
API_KEY = os.getenv("QWEN_API_KEY")
MODEL = os.getenv("MODEL_NAME")
TEMPERATURE = float(os.getenv("TEMPERATURE", 0.1))
# OpenAI-compatible client pointed at Alibaba DashScope's compatibility endpoint.
client = OpenAI(
    api_key=API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)
# Input directory of TXT chat records, output directory for extracted JSON,
# and the encoding used when reading the records.
CHAT_RECORD_DIR = "./data/txt"
JSON_OUTPUT_DIR = "./data/json"
FILE_ENCODING = "utf-8"
# Exhaustive list of fields the LLM must extract from each record
# (guardian 1 / guardian 2 / child profile / family situation / assessment).
EXTRACT_FIELDS = [
    "监护人1姓名", "家庭角色", "文化程度", "职业", "年龄", "性格特征", "联系方式",
    "监护人2姓名", "家庭角色_2", "文化程度_2", "职业_2", "年龄_2", "性格特征_2", "联系方式_2",
    "孩子姓名", "性别", "孩子年龄", "年级", "孩子性格特征", "学习成绩",
    "家庭地址", "家庭基本情况", "家庭氛围", "亲子关系", "家长有无教育分歧",
    "是否经常否定孩子", "有无打骂教育", "孩子是否在父母身边长大",
    "还有谁参与孩子的养育", "孩子成长过程中有何重大影响事件", "既往病史",
    "孩子的优点", "孩子的缺点", "孩子目前情况的描述", "参加指导最想解决", "问卷评估"
]
# All-None result template used on extraction failure. NOTE: this is a single
# shared dict — callers must copy it (e.g. dict(EMPTY_RESULT)) before mutating.
EMPTY_RESULT = {field: None for field in EXTRACT_FIELDS}
def clean_llm_response(llm_text: str) -> str:
    """Normalize an LLM reply into a parseable JSON-object string.

    Strips Markdown code fences (```json ... ```) that models often wrap
    around JSON, then slices from the first '{' to the last '}'. Unlike the
    previous lstrip/rstrip approach, slicing preserves the closing braces of
    nested objects (lstrip("{")/rstrip("}") removed ALL leading/trailing
    braces, breaking any reply containing a nested object). Also no longer
    deletes the literal word "JSON", which could corrupt field content.

    Args:
        llm_text: Raw text returned by the model; may be empty or None.

    Returns:
        A string expected to contain exactly one JSON object, or "" when
        the input is empty. Brace-less replies are wrapped in "{...}" to
        keep the original fallback behaviour.
    """
    if not llm_text:
        return ""
    # Remove Markdown fence markers and surrounding whitespace.
    text = llm_text.replace("```json", "").replace("```", "").strip()
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end < start:
        # Model omitted the braces entirely; wrap so json.loads sees an object.
        return "{" + text + "}"
    # Slice out the complete object, nested braces included.
    return text[start:end + 1]
def extract_chat_info(chat_text: str, file_name: str) -> Dict:
    """Extract the registration fields from one chat record via the LLM.

    Args:
        chat_text: Full text of the chat record / registration form; only
            the first 8000 characters are sent to the model.
        file_name: Source file name, used for error logging only.

    Returns:
        A dict keyed exactly by EXTRACT_FIELDS (absent fields are None).
        On any failure a *fresh copy* of the empty template is returned —
        the previous version returned the shared EMPTY_RESULT dict itself,
        which the caller then mutated, corrupting the template for every
        later failure.
    """
    prompt = f"""
你是专业的暖洋葱家庭教育档案信息提取分析师,仅从以下文本中提取指定字段信息,严格遵守以下规则,违反规则将判定为任务失败:
1. 提取字段:{', '.join(EXTRACT_FIELDS)},仅提取这些字段,不新增、不删减;
2. 内容规则如实提取无相关信息的字段统一填null绝不猜测、编造内容
3. 格式规则:仅返回**纯JSON字符串**,无任何多余内容,禁止加```json、```、注释、解释、换行;
4. 特殊提取要求:
   - 年龄:仅提取数字/带""的数字无则null
   - 学习成绩:根据打勾标记提取优秀/良好/一般/差无则null
   - 性别:仅男/女无则null
   - 联系方式仅纯11位手机号码无则null
   - 问卷评估:提取原始计分内容(题目+分数),用分号分隔;
   - 多信息用顿号分隔无则null。
待提取文本:
{chat_text[:8000]}
"""
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[{"role": "user", "content": prompt}],
            temperature=TEMPERATURE,
            response_format={"type": "json_object"},
            timeout=30
        )
        raw_resp = response.choices[0].message.content.strip()
        clean_resp = clean_llm_response(raw_resp)
        extract_json = json.loads(clean_resp)
        # Keep exactly the expected fields; .get() fills absent ones with None.
        return {field: extract_json.get(field) for field in EXTRACT_FIELDS}
    except json.JSONDecodeError:
        print(f"解析失败:{file_name} - 大模型返回非标准JSON返回空结果")
        # Copy so callers can safely mutate the result (they add a filename key).
        return dict(EMPTY_RESULT)
    except Exception as e:
        # API/network/timeout errors must not abort the whole batch run.
        print(f"调用失败:{file_name} - {e},返回空结果")
        return dict(EMPTY_RESULT)
def read_chat_txt(file_path: str) -> Optional[str]:
    """Read one TXT record; return None when the file is unusable.

    The original docstring promised encoding handling but had none: a
    missing file or a non-UTF-8 file raised and killed the whole batch.
    Now falls back to GBK when UTF-8 decoding fails (common for Chinese
    Windows exports) and returns None on any read error.

    Args:
        file_path: Path of the TXT file to read.

    Returns:
        The stripped file content, or None if the file is missing,
        undecodable, empty, or shorter than 20 characters.
    """
    try:
        with open(file_path, "r", encoding=FILE_ENCODING) as f:
            content = f.read().strip()
    except UnicodeDecodeError:
        # Chinese Windows tools often save TXT as GBK rather than UTF-8.
        try:
            with open(file_path, "r", encoding="gbk") as f:
                content = f.read().strip()
        except (UnicodeDecodeError, OSError):
            return None
    except OSError:
        return None
    # Records under 20 chars carry no extractable information.
    return content if content and len(content) >= 20 else None
def batch_txt_to_json():
    """Convert every TXT record in CHAT_RECORD_DIR to a JSON file.

    For each .txt file: read it, run LLM field extraction, tag the result
    with the source filename, and write one pretty-printed JSON file into
    JSON_OUTPUT_DIR. Unreadable/near-empty files are counted as failures
    and skipped. Prints a summary at the end — the counters were
    previously computed but never reported.
    """
    os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)
    txt_files = [f for f in os.listdir(CHAT_RECORD_DIR) if f.lower().endswith(".txt")]
    success_count = 0
    fail_count = 0
    for file_name in txt_files:
        txt_file_path = os.path.join(CHAT_RECORD_DIR, file_name)
        chat_content = read_chat_txt(txt_file_path)
        if not chat_content:
            fail_count += 1
            continue
        # Copy before mutating: on failure extract_chat_info may hand back
        # the shared EMPTY_RESULT template, which must stay pristine.
        extract_result = dict(extract_chat_info(chat_content, file_name))
        extract_result["文件名称"] = file_name
        json_file_name = os.path.splitext(file_name)[0] + ".json"
        json_file_path = os.path.join(JSON_OUTPUT_DIR, json_file_name)
        with open(json_file_path, "w", encoding="utf-8") as f:
            json.dump(extract_result, f, ensure_ascii=False, indent=4)
        success_count += 1
    # Surface the run outcome (previously silently discarded).
    print(f"处理完成:成功 {success_count} 个,失败/跳过 {fail_count} 个,共 {len(txt_files)} 个TXT文件")
# Script entry point: run the batch TXT-to-JSON extraction.
if __name__ == "__main__":
    batch_txt_to_json()