Merge commit '2c72c000cd4d81febd83e3049c82a9d2174fd41c'

This commit is contained in:
2026-02-27 13:06:39 +08:00
11 changed files with 978 additions and 610 deletions

View File

@@ -9,10 +9,9 @@
--model/ --model/
--__init__.py --__init__.py
--modelling.py --modelling.py
--focal_loss.py
--inference.py # 推理接口 --inference.py # 推理接口
--train.py --train.py
--test.py
--statis_main.py
--Qwen3-1.7B/ --Qwen3-1.7B/
--best_ckpt.pth --best_ckpt.pth
``` ```

View File

@@ -1,20 +1,21 @@
import os import os
import glob import glob
import json import json
from typing import List, Dict
valid_keys = [ valid_keys = [
"Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index", "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index",
"Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
"Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
"Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
"Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential"
] ]
ch_valid_keys = [ ch_valid_keys = [
"核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
"社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
"沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
"低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
"竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力"
] ]
all_keys = valid_keys + ["session_id", "label"] all_keys = valid_keys + ["session_id", "label"]
en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)}
@@ -22,7 +23,7 @@ d1_keys = valid_keys[:5]
d2_keys = valid_keys[5:10] d2_keys = valid_keys[5:10]
d3_keys = valid_keys[10:15] d3_keys = valid_keys[10:15]
d4_keys = valid_keys[15:19] d4_keys = valid_keys[15:19]
d5_keys = valid_keys[19:23] d5_keys = valid_keys[19:22]
def extract_json_files(folder: str): def extract_json_files(folder: str):
json_files = glob.glob(os.path.join(folder, "*.json")) json_files = glob.glob(os.path.join(folder, "*.json"))
@@ -47,17 +48,17 @@ def try_match_error_key(error_key: str):
else: else:
return None return None
def filt_json_data(json_data: dict): def filt_json_data(json_data: dict, threshold: int = 10):
new_json_data = {} new_json_data = {}
for k, v in json_data.items(): for k, v in json_data.items():
if len(v) >= 10: if len(v) >= threshold and len(v) != 0:
new_json_data[k] = v new_json_data[k] = v
print(f"Total {len(new_json_data)} json keys after filter") print(f"Total {len(new_json_data)} json data after filter with threshold {threshold}")
return new_json_data return new_json_data
def extract_json_data(json_files: list) -> dict: def extract_json_data(json_files: list, threshold: int = 10) -> dict:
data = {} data = {}
for json_file in json_files: for json_file in json_files:
session_id = os.path.basename(json_file).split(".")[0] session_id = os.path.basename(json_file).split(".")[0]
@@ -66,24 +67,37 @@ def extract_json_data(json_files: list) -> dict:
json_data = json.load(f) json_data = json.load(f)
for key, value in json_data.items(): for key, value in json_data.items():
if key in valid_keys: if key in valid_keys:
data[session_id][key] = value['value'] data[session_id][key] = value.get("value", None)
elif key == "Follow_up_Priority":
continue
else: else:
match_key = try_match_error_key(key) match_key = try_match_error_key(key)
if match_key: if match_key:
data[session_id][match_key] = value['value'] data[session_id][match_key] = value.get("value", None)
else: else:
raise ValueError(f"Invalid key {key} in {json_file}") raise ValueError(f"Invalid key {key} in {json_file}")
return filt_json_data(data) return filt_json_data(data, threshold)
def load_data_from_dict(data_dict: List[dict]):
    """Load feature records directly from a list of raw dicts.

    Unlike extract_json_data, no threshold filtering is applied here;
    every record is kept. Keys are normalized against valid_keys, and
    unknown keys are fuzzy-matched via try_match_error_key. Records are
    indexed by their position in the input list.
    """
    result = {}
    for idx, record in enumerate(data_dict):
        features = {}
        for raw_key, raw_value in record.items():
            if raw_key in valid_keys:
                features[raw_key] = raw_value.get("value", None)
            elif raw_key == "Follow_up_Priority":
                # Key was dropped from the schema; ignore it silently.
                continue
            else:
                corrected = try_match_error_key(raw_key)
                if corrected:
                    features[corrected] = raw_value.get("value", None)
                else:
                    # Non-fatal here (unlike extract_json_data, which raises).
                    print(f"Warning: Invalid key {raw_key} in data dict, skipped.")
        result[idx] = features
    return result
if __name__=="__main__": if __name__=="__main__":
deal_folder = "deal" deal_folder = "deal"
not_deal_folder = "not_deal" not_deal_folder = "not_deal"
deal_json_files = extract_json_files(deal_folder)
deal_data = extract_json_data(deal_json_files)
deal_txt_files = extract_txt_files(deal_folder)
not_deal_json_files = extract_json_files(not_deal_folder)
not_deal_data = extract_json_data(not_deal_json_files)
not_deal_txt_files = extract_txt_files(not_deal_folder)

View File

@@ -1,172 +1,218 @@
import random import random
import numpy as np import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split from torch.utils.data import Dataset, DataLoader, random_split
import torch import torch
from transformers import AutoTokenizer from transformers import AutoTokenizer
from .content_extract import extract_json_files, extract_json_data from .content_extract import extract_json_files, extract_json_data
valid_keys = [ valid_keys = [
"Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index", "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index",
"Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
"Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
"Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
"Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential"
] ]
ch_valid_keys = [ ch_valid_keys = [
"核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
"社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
"沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
"低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
"竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力"
] ]
all_keys = valid_keys + ["session_id", "label"] all_keys = valid_keys + ["session_id", "label"]
en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)}
d1_keys = valid_keys[:5] d1_keys = valid_keys[:5]
d2_keys = valid_keys[5:10] d2_keys = valid_keys[5:10]
d3_keys = valid_keys[10:15] d3_keys = valid_keys[10:15]
d4_keys = valid_keys[15:19] d4_keys = valid_keys[15:19]
d5_keys = valid_keys[19:23] d5_keys = valid_keys[19:22]
class Formatter: class Formatter:
def __init__(self, en2ch): def __init__(self, en2ch):
self.en2ch = en2ch self.en2ch = en2ch
def _build_user_profile(self, profile: dict) -> str: def _build_user_profile(self, profile: dict) -> str:
sections = [] sections = []
sections.append("[客户画像]") sections.append("[客户画像]")
sections.append("\n [痛感和焦虑等级]") sections.append("\n [痛感和焦虑等级]")
for key in d1_keys: for key in d1_keys:
if key in profile: if key in profile:
sections.append(f"{self.en2ch[key]}: {profile[key]}") if profile[key] is None:
continue
sections.append("\n [支付意愿与能力]") sections.append(f"{self.en2ch[key]}: {profile[key]}")
for key in d2_keys:
if key in profile: sections.append("\n [支付意愿与能力]")
sections.append(f"{self.en2ch[key]}: {profile[key]}") for key in d2_keys:
if key in profile:
sections.append("\n [成交阻力与防御机制]") if profile[key] is None:
for key in d3_keys: continue
if key in profile: sections.append(f"{self.en2ch[key]}: {profile[key]}")
sections.append(f"{self.en2ch[key]}: {profile[key]}")
sections.append("\n [成交阻力与防御机制]")
sections.append("\n [情绪钩子与成交切入点]") for key in d3_keys:
for key in d4_keys: if key in profile:
if key in profile: if profile[key] is None:
sections.append(f"{self.en2ch[key]}: {profile[key]}") continue
sections.append(f"{self.en2ch[key]}: {profile[key]}")
sections.append("\n [客户生命周期状态]")
for key in d5_keys: sections.append("\n [情绪钩子与成交切入点]")
if key in profile: for key in d4_keys:
sections.append(f"{self.en2ch[key]}: {profile[key]}") if key in profile:
return "\n".join(sections) if profile[key] is None:
continue
def get_llm_prompt(self, features): sections.append(f"{self.en2ch[key]}: {profile[key]}")
user_profile = self._build_user_profile(features)
sections.append("\n [客户生命周期状态]")
prompt = f""" for key in d5_keys:
你是一个销售心理学专家,请分析以下客户特征: if key in profile:
if profile[key] is None:
{user_profile} continue
sections.append(f"{self.en2ch[key]}: {profile[key]}")
请提取客户的核心购买驱动力和主要障碍后分析该客户的成交概率。将成交概率以JSON格式输出 return "\n".join(sections)
{{
"conversion_probability": 0-1之间的数值 def get_llm_prompt(self, features: dict) -> list:
}} user_profile = self._build_user_profile(features)
"""
prompt = prompt = f"""
messages = [ 请分析以下客户特征预测成交概率0~1之间
{"role": "user", "content": prompt}
] {user_profile}
return messages
成交概率:
class TransDataset(Dataset): """
def __init__(self, deal_data_folder, not_deal_data_folder):
self.deal_data = extract_json_data(extract_json_files(deal_data_folder)) messages = [
self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder)) {"role": "user", "content": prompt}
]
self.formatter = Formatter(en2ch) return messages
num_deal = len(self.deal_data) class TransDataset(Dataset):
num_not_deal = len(self.not_deal_data) def __init__(self, deal_data_folder, not_deal_data_folder, threshold: int = 10, balance: bool = True):
num_threshold = max(num_deal, num_not_deal) * 0.8 self.deal_data = extract_json_data(extract_json_files(deal_data_folder), threshold)
self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder), threshold)
if not all([num_deal >= num_threshold, num_not_deal >= num_threshold]):
self._balance_samples() self.formatter = Formatter(en2ch)
self._build_samples() num_deal = len(self.deal_data)
num_not_deal = len(self.not_deal_data)
def _build_samples(self): num_threshold = max(num_deal, num_not_deal) * 0.8
self.samples = []
if not all([num_deal >= num_threshold, num_not_deal >= num_threshold]) and balance:
for id, features in self.deal_data.items(): self._balance_samples()
messages = self.formatter.get_llm_prompt(features)
self.samples.append((id, messages, 1)) self._build_samples()
for id, features in self.not_deal_data.items():
messages = self.formatter.get_llm_prompt(features) def _build_samples(self):
self.samples.append((id, messages, 0)) self.samples = []
random.shuffle(self.samples) for id, features in self.deal_data.items():
print(f"total samples num: {len(self.samples)}, deal num: {len(self.deal_data)}, not deal num: {len(self.not_deal_data)}") messages = self.formatter.get_llm_prompt(features)
self.samples.append((id, messages, 1))
def _balance_samples(self): for id, features in self.not_deal_data.items():
random.seed(42) messages = self.formatter.get_llm_prompt(features)
np.random.seed(42) self.samples.append((id, messages, 0))
not_deal_ids = list(self.not_deal_data.keys()) random.shuffle(self.samples)
target_size = len(self.deal_data) print(f"total samples num: {len(self.samples)}, deal num: {len(self.deal_data)}, not deal num: {len(self.not_deal_data)}")
if len(not_deal_ids) > target_size: def _balance_samples(self):
selected_not_deal_ids = random.sample(not_deal_ids, target_size) random.seed(42)
self.not_deal_data = {sid: self.not_deal_data[sid] for sid in selected_not_deal_ids} np.random.seed(42)
def __len__(self): not_deal_ids = list(self.not_deal_data.keys())
return len(self.samples) target_size = len(self.deal_data)
def __getitem__(self, idx): if len(not_deal_ids) > target_size:
id, prompt, label = self.samples[idx] selected_not_deal_ids = random.sample(not_deal_ids, target_size)
return id, prompt, label self.not_deal_data = {sid: self.not_deal_data[sid] for sid in selected_not_deal_ids}
def build_dataloader(deal_data_folder, not_deal_data_folder, batch_size): def __len__(self):
dataset = TransDataset(deal_data_folder, not_deal_data_folder) return len(self.samples)
num_data = len(dataset)
def __getitem__(self, idx):
train_size = int(0.8 * num_data) id, prompt, label = self.samples[idx]
val_size = int(0.1 * num_data) return id, prompt, label
test_size = num_data - train_size - val_size
class OfflineTransDataset(Dataset):
print(f"train size: {train_size}") def __init__(self, deal_data_folder, not_deal_data_folder, threshold: int = 10):
print(f"val size: {val_size}") self.deal_data = extract_json_data(extract_json_files(deal_data_folder), threshold)
print(f"test size: {test_size}") self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder), threshold)
train_dataset, val_dataset, test_dataset = random_split(
dataset, self.formatter = Formatter(en2ch)
[train_size, val_size, test_size], self.samples = []
generator=torch.Generator().manual_seed(42)
) for id, features in self.deal_data.items():
messages = self.formatter.get_llm_prompt(features)
def collate_fn(batch): self.samples.append((id, messages, 1))
ids = [item[0] for item in batch] for id, features in self.not_deal_data.items():
texts = [item[1] for item in batch] messages = self.formatter.get_llm_prompt(features)
labels = torch.tensor([item[2] for item in batch], dtype=torch.long) self.samples.append((id, messages, 0))
return ids, texts, labels
def __len__(self):
train_loader = DataLoader( return len(self.samples)
train_dataset,
batch_size=batch_size, def __getitem__(self, idx):
shuffle=True, id, prompt, label = self.samples[idx]
collate_fn=collate_fn return id, prompt, label
)
val_loader = DataLoader( def build_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold: int = 10, balance: bool = True):
val_dataset, dataset = TransDataset(deal_data_folder, not_deal_data_folder, threshold, balance)
batch_size=batch_size, num_data = len(dataset)
shuffle=False,
collate_fn=collate_fn train_size = int(0.8 * num_data)
) val_size = int(0.1 * num_data)
test_loader = DataLoader( test_size = num_data - train_size - val_size
test_dataset,
batch_size=batch_size, print(f"train size: {train_size}")
shuffle=False, print(f"val size: {val_size}")
collate_fn=collate_fn print(f"test size: {test_size}")
) train_dataset, val_dataset, test_dataset = random_split(
return {"train": train_loader, "val": val_loader, "test": test_loader} dataset,
[train_size, val_size, test_size],
generator=torch.Generator().manual_seed(42)
)
def collate_fn(batch):
ids = [item[0] for item in batch]
texts = [item[1] for item in batch]
labels = torch.tensor([item[2] for item in batch], dtype=torch.long)
return ids, texts, labels
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
collate_fn=collate_fn
)
val_loader = DataLoader(
val_dataset,
batch_size=batch_size,
shuffle=False,
collate_fn=collate_fn
)
test_loader = DataLoader(
test_dataset,
batch_size=batch_size,
shuffle=False,
collate_fn=collate_fn
)
return {"train": train_loader, "val": val_loader, "test": test_loader}
def build_offline_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold: int = 10):
dataset = OfflineTransDataset(deal_data_folder, not_deal_data_folder, threshold)
def collate_fn(batch):
ids = [item[0] for item in batch]
texts = [item[1] for item in batch]
labels = torch.tensor([item[2] for item in batch], dtype=torch.long)
return ids, texts, labels
offline_loader = DataLoader(
dataset,
batch_size=batch_size,
shuffle=True,
collate_fn=collate_fn
)
return offline_loader

View File

@@ -1,11 +1,10 @@
from model import TransClassifier from model import TransClassifier
from transformers import AutoTokenizer from transformers import AutoTokenizer
from data_process import extract_json_data, Formatter from data_process import extract_json_data, Formatter, load_data_from_dict
import torch import torch
import json import json
from typing import Dict, List, Optional from typing import Dict, List, Optional
import os import os
import random
import warnings import warnings
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
@@ -14,14 +13,14 @@ valid_keys = [
"Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
"Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
"Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
"Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential"
] ]
ch_valid_keys = [ ch_valid_keys = [
"核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
"社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
"沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
"低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
"竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力"
] ]
all_keys = valid_keys + ["session_id", "label"] all_keys = valid_keys + ["session_id", "label"]
en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)}
@@ -29,7 +28,7 @@ d1_keys = valid_keys[:5]
d2_keys = valid_keys[5:10] d2_keys = valid_keys[5:10]
d3_keys = valid_keys[10:15] d3_keys = valid_keys[10:15]
d4_keys = valid_keys[15:19] d4_keys = valid_keys[15:19]
d5_keys = valid_keys[19:23] d5_keys = valid_keys[19:22]
class InferenceEngine: class InferenceEngine:
def __init__(self, backbone_dir: str, ckpt_path: str = "best_ckpt.pth", device: str = "cuda"): def __init__(self, backbone_dir: str, ckpt_path: str = "best_ckpt.pth", device: str = "cuda"):
@@ -42,7 +41,7 @@ class InferenceEngine:
print(f"Tokenizer loaded from {backbone_dir}") print(f"Tokenizer loaded from {backbone_dir}")
# 加载模型 # 加载模型
self.model = TransClassifier(backbone_dir, device) self.model = TransClassifier(backbone_dir, 2, device)
self.model.to(device) self.model.to(device)
if self.ckpt_path: if self.ckpt_path:
self.model.load_state_dict(torch.load(ckpt_path, map_location=device)) self.model.load_state_dict(torch.load(ckpt_path, map_location=device))
@@ -57,25 +56,17 @@ class InferenceEngine:
def inference_batch(self, json_list: List[str]) -> dict: def inference_batch(self, json_list: List[str]) -> dict:
""" """
批量推理函数,输入为 JSON 字符串列表输出为包含转换概率的字典列表。为防止OOM列表最大长度为8。 批量推理函数,输入为 JSON 字符串列表输出为包含转换概率的字典列表。为防止OOM列表最大长度为8。
请注意Json文件中的词条数必须大于等于10. 请注意Json文件中的词条数必须大于等于5.
""" """
# print(111111) assert len(json_list) <= 8, "单次输入json文件数量不可超过8。"
assert len(json_list) <= 10, "单次输入json文件数量不可超过8。" id2feature = extract_json_data(json_files=json_list, threshold=5)
id2feature = extract_json_data(json_list)
print(json.dumps(id2feature ,indent=2 ,ensure_ascii=False))
# id2feature
message_list = [] message_list = []
for id, feature in id2feature.items(): for id, feature in id2feature.items():
messages = self.formatter.get_llm_prompt(feature) messages = self.formatter.get_llm_prompt(feature)
message_list.append(messages) message_list.append(messages)
inputs = self.tokenizer.apply_chat_template( inputs = self.tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True, enable_thinking=False)
message_list,
tokenize=False,
add_generation_prompt=True,
enable_thinking=False
)
model_inputs = self.tokenizer( model_inputs = self.tokenizer(
inputs, inputs,
padding=True, padding=True,
@@ -87,21 +78,12 @@ class InferenceEngine:
with torch.inference_mode(): with torch.inference_mode():
with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16): with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16):
outputs = self.model(model_inputs) outputs = self.model(model_inputs)
preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist()
# 1. 计算分类标签argmax outputs_float = outputs.float()
preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2]
probs = probs.cpu().numpy().tolist()
# 2. 计算softmax概率核心修正转CPU、转numpy、转列表解决Tensor序列化问题 probs = [p[1] for p in probs]
outputs_float = outputs.float() # 转换为 float32 避免精度问题 return {"labels": preds, "probs": probs}
probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2]
# 转换为CPU的numpy数组再转列表每个样本对应2个类别的概率
probs = probs.cpu().numpy().tolist()
probs = [p[1] for p in probs] # 只保留类别1的概率
# 3. 计算置信度
confidence = [abs(p - 0.5) * 2 for p in probs]
# 返回格式labels是每个样本的分类标签列表probs是每个样本的类别概率列表confidence是每个样本的置信度列表
return {"labels": preds, "probs": probs, "confidence": confidence}
def inference_sample(self, json_path: str) -> dict: def inference_sample(self, json_path: str) -> dict:
""" """
@@ -110,23 +92,26 @@ class InferenceEngine:
""" """
return self.inference_batch([json_path]) return self.inference_batch([json_path])
def inference( # def inference(
self, # self,
featurs : dict[str ,dict] # featurs : dict[str ,dict]
): # ):
# assert len(featurs) <= 10, "单次输入json文件数量不可超过8。" # # assert len(featurs) <= 10, "单次输入json文件数量不可超过8。"
def inference_batch_json_data(self, json_data: List[dict]) -> dict:
"""
批量推理函数,输入为 JSON 数据输出为包含转换概率的字典列表。为防止OOM列表最大长度为8。
请注意Json文件中的词条数必须大于等于5. 但此处不进行过滤,请注意稍后对输出进行过滤。
"""
assert len(json_data) <= 8, "单次输入json数据数量不可超过8。"
pseudo_id2feature = load_data_from_dict(json_data)
message_list = [] message_list = []
for id, feature in featurs.items(): for id, feature in pseudo_id2feature.items():
messages = self.formatter.get_llm_prompt(feature) messages = self.formatter.get_llm_prompt(feature)
message_list.append(messages) message_list.append(messages)
inputs = self.tokenizer.apply_chat_template( inputs = self.tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True, enable_thinking=False)
message_list,
tokenize=False,
add_generation_prompt=True,
enable_thinking=False
)
model_inputs = self.tokenizer( model_inputs = self.tokenizer(
inputs, inputs,
padding=True, padding=True,
@@ -138,26 +123,30 @@ class InferenceEngine:
with torch.inference_mode(): with torch.inference_mode():
with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16): with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16):
outputs = self.model(model_inputs) outputs = self.model(model_inputs)
preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist()
# 1. 计算分类标签argmax outputs_float = outputs.float()
preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2]
probs = probs.cpu().numpy().tolist()
# 2. 计算softmax概率核心修正转CPU、转numpy、转列表解决Tensor序列化问题 probs = [p[1] for p in probs]
outputs_float = outputs.float() # 转换为 float32 避免精度问题 return {"labels": preds, "probs": probs}
probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2]
# 转换为CPU的numpy数组再转列表每个样本对应2个类别的概率
probs = probs.cpu().numpy().tolist()
probs = [p[1] for p in probs] # 只保留类别1的概率
# 3. 计算置信度
confidence = [abs(p - 0.5) * 2 for p in probs]
# 返回格式labels是每个样本的分类标签列表probs是每个样本的类别概率列表confidence是每个样本的置信度列表
return {"labels": preds, "probs": probs, "confidence": confidence}
if __name__ == "__main__": if __name__ == "__main__":
# 配置参数
backbone_dir = "Qwen3-1.7B" backbone_dir = "Qwen3-1.7B"
ckpt_path = "best_ckpt.pth" ckpt_path = "best_ckpt.pth"
device = "cuda" device = "cuda"
engine = InferenceEngine(backbone_dir, ckpt_path, device) engine = InferenceEngine(backbone_dir, ckpt_path, device)
import glob
deal_files = glob.glob(os.path.join("filtered_deal", "*.json"))
test_deal_files = deal_files[:4]
not_deal_files = glob.glob(os.path.join("filtered_not_deal", "*.json"))
test_not_deal_files = not_deal_files[:4]
test_files = test_deal_files + test_not_deal_files
test_dict = []
for test_file in test_files:
with open(test_file, "r", encoding="utf-8") as f:
json_data = json.load(f)
test_dict.append(json_data)
results = engine.inference_batch_json_data(test_dict)
print(results)

View File

@@ -1 +1,2 @@
from .modelling import TransClassifier from .modelling import TransClassifier
from .focal_loss import FocalLoss

135
model/focal_loss.py Normal file
View File

@@ -0,0 +1,135 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class FocalLoss(nn.Module):
    """Unified Focal Loss for binary, multi-class, and multi-label classification.

    Focal loss (Lin et al., "Focal Loss for Dense Object Detection") scales the
    cross-entropy of each sample by the modulating factor (1 - p_t)^gamma, which
    down-weights well-classified examples and focuses training on hard ones.

    :param gamma: Focusing parameter; strength of the (1 - p_t)^gamma modulation.
    :param alpha: Class-balancing factor. Scalar for binary/multi-label; scalar,
        list, or tensor of per-class weights for multi-class. None disables it.
    :param reduction: 'none' | 'mean' | 'sum'. NOTE: for multi-class, 'mean'
        averages over the (batch, num_classes) loss matrix, so the value equals
        cross_entropy / num_classes when gamma=0 — preserved for compatibility.
    :param task_type: 'binary', 'multi-class', or 'multi-label'.
    :param num_classes: Number of classes (required for multi-class).
    """

    def __init__(self, gamma=2, alpha=None, reduction='mean', task_type='binary', num_classes=None):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction
        self.task_type = task_type
        self.num_classes = num_classes

        # Normalize alpha to a per-class tensor for multi-class so that
        # forward() can always call .to(device) / .gather() on it.
        # FIX: a scalar alpha previously crashed in forward (float has no .to).
        if task_type == 'multi-class' and alpha is not None:
            assert num_classes is not None, "num_classes must be specified for multi-class classification"
            if isinstance(alpha, list):
                self.alpha = torch.Tensor(alpha)
            elif not isinstance(alpha, torch.Tensor):
                # Scalar -> uniform per-class weight vector.
                self.alpha = torch.full((num_classes,), float(alpha))

    def forward(self, inputs, targets):
        """Compute the focal loss for the configured task type.

        :param inputs: Logits. Shape (batch_size, num_classes) for all tasks.
        :param targets: Ground-truth labels. Shape (batch_size,) for
            binary/multi-class; (batch_size, num_classes) for multi-label.
        :raises ValueError: if task_type is not one of the supported values.
        """
        if self.task_type == 'binary':
            return self.binary_focal_loss(inputs, targets)
        elif self.task_type == 'multi-class':
            return self.multi_class_focal_loss(inputs, targets)
        elif self.task_type == 'multi-label':
            return self.multi_label_focal_loss(inputs, targets)
        else:
            raise ValueError(
                f"Unsupported task_type '{self.task_type}'. Use 'binary', 'multi-class', or 'multi-label'.")

    def binary_focal_loss(self, inputs, targets):
        """Focal loss for binary classification (logits in, hard labels in)."""
        probs = torch.sigmoid(inputs)
        targets = targets.float()

        # Per-element BCE; reduction deferred until after focal weighting.
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')

        # p_t = prob assigned to the true class of each element.
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma

        # alpha for positives, (1 - alpha) for negatives.
        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            bce_loss = alpha_t * bce_loss

        loss = focal_weight * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss

    def multi_class_focal_loss(self, inputs, targets):
        """Focal loss for multi-class classification (logits in, class indices in)."""
        if self.alpha is not None:
            alpha = self.alpha.to(inputs.device)

        # FIX: use log_softmax instead of log(softmax) for numerical
        # stability (avoids -inf when a probability underflows to 0).
        log_probs = F.log_softmax(inputs, dim=1)
        probs = log_probs.exp()

        targets_one_hot = F.one_hot(targets, num_classes=self.num_classes).float()

        # Per-(sample, class) cross-entropy; only the target column is nonzero.
        ce_loss = -targets_one_hot * log_probs

        # p_t = predicted probability of the true class for each sample.
        p_t = torch.sum(probs * targets_one_hot, dim=1)
        focal_weight = (1 - p_t) ** self.gamma

        # Per-class alpha weighting, indexed by each sample's target class.
        if self.alpha is not None:
            alpha_t = alpha.gather(0, targets)
            ce_loss = alpha_t.unsqueeze(1) * ce_loss

        loss = focal_weight.unsqueeze(1) * ce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss

    def multi_label_focal_loss(self, inputs, targets):
        """Focal loss for multi-label classification (logits in, multi-hot labels in)."""
        probs = torch.sigmoid(inputs)

        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')

        # Each label position is an independent binary problem.
        p_t = probs * targets + (1 - probs) * (1 - targets)
        focal_weight = (1 - p_t) ** self.gamma

        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            bce_loss = alpha_t * bce_loss

        loss = focal_weight * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss

View File

@@ -1,52 +1,57 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from transformers import AutoModel from transformers import AutoModel
class TransClassifier(nn.Module): class TransClassifier(nn.Module):
def __init__(self, model_dir: str, device: str="cuda"): def __init__(self, model_dir: str, output_classes: int, device: str="cuda"):
super().__init__() super().__init__()
self.backbone = AutoModel.from_pretrained( self.backbone = AutoModel.from_pretrained(
model_dir, model_dir,
dtype = "bfloat16" dtype = "bfloat16",
).to(device).eval() attn_implementation="flash_attention_2"
self.device = device ).to(device).eval()
self.torch_dtype = torch.bfloat16 self.device = device
self.hidden_size = self.backbone.config.hidden_size self.torch_dtype = torch.bfloat16
self.hidden_size = self.backbone.config.hidden_size
self.classifier = nn.Sequential(
nn.LayerNorm(self.hidden_size), self.token_proj = nn.Linear(self.hidden_size, self.hidden_size).to(device=device, dtype=self.torch_dtype)
nn.Linear(self.hidden_size, self.hidden_size//2), self.classifier = nn.Sequential(
nn.GELU(), nn.LayerNorm(self.hidden_size),
nn.Dropout(0.3), nn.Linear(self.hidden_size, self.hidden_size//2),
nn.Linear(self.hidden_size//2, self.hidden_size//4), nn.GELU(),
nn.GELU(), nn.Dropout(0.3),
nn.Dropout(0.2), nn.Linear(self.hidden_size//2, self.hidden_size//4),
nn.Linear(self.hidden_size//4, 2) nn.GELU(),
).to(device=device, dtype=self.torch_dtype) nn.Dropout(0.2),
nn.Linear(self.hidden_size//4, output_classes)
for param in self.backbone.parameters(): ).to(device=device, dtype=self.torch_dtype)
param.requires_grad = False
for param in self.backbone.parameters():
def forward(self, model_inputs: dict): param.requires_grad = False
outputs = self.backbone(**model_inputs)
def forward(self, model_inputs: dict):
last_hidden_state = outputs.last_hidden_state outputs = self.backbone(**model_inputs)
# take last token hidden state proj_states = self.token_proj(outputs.last_hidden_state)
cls_hidden_state = last_hidden_state[:, -1, :]
attention_mask = model_inputs['attention_mask']
logits = self.classifier(cls_hidden_state) mask_expanded = attention_mask.unsqueeze(-1).expand_as(proj_states).to(proj_states.dtype)
return logits sum_states = (proj_states * mask_expanded).sum(dim=1)
valid_tokens = mask_expanded.sum(dim=1)
if __name__ == "__main__": pooled = sum_states / valid_tokens.clamp(min=1e-9)
model_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B"
device = "cuda" logits = self.classifier(pooled)
model = TransClassifier(model_dir, device) return logits
print(model.hidden_size)
print(model) if __name__ == "__main__":
model_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B"
total_params = sum(p.numel() for p in model.parameters()) device = "cuda"
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) model = TransClassifier(model_dir, device)
print(model.hidden_size)
print(f"总参数量: {total_params:,}") print(model)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"总参数量: {total_params:,}")
print(f"可训练参数量: {trainable_params:,}") print(f"可训练参数量: {trainable_params:,}")

130
offline_test.py Normal file
View File

@@ -0,0 +1,130 @@
from data_process import build_offline_dataloader
from model import TransClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os
import json
import gc
from tqdm import tqdm
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from transformers import AutoTokenizer
warnings.filterwarnings("ignore")
def offline_test(
    deal_data_folder, not_deal_data_folder, batch_size, threshold,
    backbone_dir, ckpt_path, device, filtered=False
):
    """Evaluate a trained checkpoint on the held-out (offline) data.

    Args:
        deal_data_folder: folder with positive ("deal") samples.
        not_deal_data_folder: folder with negative ("not deal") samples.
        batch_size: evaluation batch size.
        threshold: turn-count threshold the checkpoint was trained with;
            also embedded in the output file names.
        backbone_dir: directory of the pretrained backbone / tokenizer.
        ckpt_path: path to the trained classifier state dict (.pth).
        device: torch device to run on.
        filtered: if True, tag the output files as the filtered split.

    Side effects:
        Writes offline_test_result[_filtered]_{threshold}.json (aggregate
        metrics) and offline_test_predictions[_filtered]_{threshold}.csv
        (per-sample predictions).
    """
    offline_loader = build_offline_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold)
    tokenizer = AutoTokenizer.from_pretrained(backbone_dir)
    model = TransClassifier(backbone_dir, 2, device)
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.eval()
    all_ids = []
    all_preds = []
    all_probs = []
    all_labels = []
    pbar = tqdm(offline_loader, desc="Testing")
    with torch.inference_mode():
        for batch_idx, (ids, texts, labels) in enumerate(pbar):
            all_ids.extend(ids)
            texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device)
            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist()
            # Compute probabilities in float32 to avoid bf16 precision issues;
            # keep only P(class=1) for AUC.
            probs = torch.softmax(outputs.float(), dim=1)[:, 1].cpu().numpy().tolist()
            all_preds.extend(preds)
            all_probs.extend(probs)
            # labels are only needed on the CPU for sklearn metrics — no
            # device round-trip required.
            all_labels.extend(labels.cpu().numpy().tolist())
            # Free GPU memory between batches.
            del texts, labels, outputs
            torch.cuda.empty_cache()
            gc.collect()
    # Aggregate metrics; per-class arrays are computed once and indexed.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    f1 = f1_score(all_labels, all_preds, average="weighted")
    auc = roc_auc_score(all_labels, all_probs)
    cm = confusion_matrix(all_labels, all_preds)
    precision_per_class = precision_score(all_labels, all_preds, average=None)
    recall_per_class = recall_score(all_labels, all_preds, average=None)
    f1_per_class = f1_score(all_labels, all_preds, average=None)
    # Cast numpy scalars to plain Python floats so json.dump never chokes.
    test_results = {
        "accuracy": float(accuracy),
        "precision_weighted": float(precision),
        "recall_weighted": float(recall),
        "f1_weighted": float(f1),
        "auc": float(auc),
        "confusion_matrix": cm.tolist(),
        "class_0_precision": float(precision_per_class[0]),
        "class_0_recall": float(recall_per_class[0]),
        "class_0_f1": float(f1_per_class[0]),
        "class_1_precision": float(precision_per_class[1]),
        "class_1_recall": float(recall_per_class[1]),
        "class_1_f1": float(f1_per_class[1])
    }
    # One suffix instead of duplicated filtered/unfiltered branches.
    suffix = f"filtered_{threshold}" if filtered else f"{threshold}"
    with open(f"offline_test_result_{suffix}.json", "w", encoding="utf-8") as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    pred_df = pd.DataFrame({
        "ids": all_ids,
        "predictions": all_preds,
        "probability": all_probs,
        "true_labels": all_labels
    })
    pred_df.to_csv(f"offline_test_predictions_{suffix}.csv", index=False, encoding="utf-8")
if __name__=="__main__":
    # Hold-out data locations: raw splits and their filtered counterparts.
    filtered_deal_folder = "not_trained_deal_filtered"
    filtered_not_deal_folder = "not_trained_not_deal_filtered"
    deal_folder = "not_trained_deal"
    not_deal_folder = "not_trained_not_deal"
    batch_size = 8
    backbone_dir = "Qwen3-1.7B"
    # Earlier threshold sweep (3..8), kept for reference as comments rather
    # than as a dead bare-string expression:
    #   for i in range(3, 9):
    #       threshold = i
    #       ckpt_path = f"best_ckpt_threshold_{threshold}.pth"
    #       device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #       offline_test(deal_folder, not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=False)
    #       offline_test(filtered_deal_folder, filtered_not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=True)
    threshold = 5
    ckpt_path = f"best_ckpt_threshold_{threshold}_1st.pth"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Evaluate the same checkpoint on both the raw and the filtered split.
    offline_test(deal_folder, not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=False)
    offline_test(filtered_deal_folder, filtered_not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=True)

154
test.py
View File

@@ -1,154 +0,0 @@
from data_process import build_dataloader
from model import TransClassifier
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime
import gc
from tqdm import tqdm
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
warnings.filterwarnings("ignore")
def test(backbone_dir, deal_folder, not_deal_folder, batch_size, ckpt_path="best_ckpt.pth", device="cuda"):
    """Evaluate the classifier on the held-out test split.

    Args:
        backbone_dir: directory of the pretrained backbone / tokenizer.
        deal_folder: folder with positive ("deal") samples.
        not_deal_folder: folder with negative ("not deal") samples.
        batch_size: evaluation batch size.
        ckpt_path: path to the trained checkpoint (state dict).
        device: torch device to run on.

    Returns:
        dict of aggregate and per-class metrics. Also writes
        test_results.json and per-sample predictions to test_predictions.csv.
    """
    # Load the test split.
    data_dict = build_dataloader(deal_folder, not_deal_folder, batch_size)
    test_loader = data_dict["test"]
    print(f"Test data loaded successfully. Test samples: {len(test_loader.dataset)}")
    # Load tokenizer and model.
    tokenizer = AutoTokenizer.from_pretrained(backbone_dir)
    model = TransClassifier(backbone_dir, device)
    model.to(device)
    # Restore trained weights when available.
    if os.path.exists(ckpt_path):
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
        print(f"Model loaded from {ckpt_path}")
    else:
        print(f"Warning: {ckpt_path} not found. Using untrained model.")
    model.eval()
    all_ids = []
    all_preds = []
    all_labels = []
    test_loss = 0.0
    loss_func = nn.CrossEntropyLoss()
    pbar = tqdm(test_loader, desc="Testing")
    with torch.inference_mode():
        for batch_idx, (ids, texts, labels) in enumerate(pbar):
            all_ids.extend(ids)
            labels = labels.to(device)
            # Build model inputs from the chat transcripts.
            texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device)
            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(inputs)
                loss = loss_func(outputs, labels)
            test_loss += loss.item()
            # Collect predictions.
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            # Free GPU memory between batches.
            del texts, labels, outputs, loss
            torch.cuda.empty_cache()
            gc.collect()
    # Aggregate metrics. Per-class arrays are computed ONCE here instead of
    # re-invoking the sklearn scorers for every printed/stored value.
    avg_loss = test_loss / len(test_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    f1 = f1_score(all_labels, all_preds, average="weighted")
    cm = confusion_matrix(all_labels, all_preds)
    precision_per_class = precision_score(all_labels, all_preds, average=None)
    recall_per_class = recall_score(all_labels, all_preds, average=None)
    f1_per_class = f1_score(all_labels, all_preds, average=None)
    # Print the evaluation report.
    print("\n=== Test Results ===")
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\n=== Class-wise Metrics ===")
    print("Class 0 (Not Deal):")
    print(f"  Precision: {precision_per_class[0]:.4f}")
    print(f"  Recall: {recall_per_class[0]:.4f}")
    print(f"  F1 Score: {f1_per_class[0]:.4f}")
    print("\nClass 1 (Deal):")
    print(f"  Precision: {precision_per_class[1]:.4f}")
    print(f"  Recall: {recall_per_class[1]:.4f}")
    print(f"  F1 Score: {f1_per_class[1]:.4f}")
    # Persist metrics; numpy scalars are cast to Python floats for JSON.
    test_results = {
        "average_loss": avg_loss,
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "confusion_matrix": cm.tolist(),
        "class_0_precision": float(precision_per_class[0]),
        "class_0_recall": float(recall_per_class[0]),
        "class_0_f1": float(f1_per_class[0]),
        "class_1_precision": float(precision_per_class[1]),
        "class_1_recall": float(recall_per_class[1]),
        "class_1_f1": float(f1_per_class[1]),
        "test_samples": len(all_labels),
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    # Persist per-sample predictions.
    pred_results = {
        "ids": all_ids,
        "predictions": all_preds,
        "true_labels": all_labels
    }
    pred_df = pd.DataFrame(pred_results)
    pred_df.to_csv("test_predictions.csv", index=False, encoding="utf-8")
    # Save metrics as JSON.
    with open("test_results.json", "w", encoding="utf-8") as f:
        json.dump(test_results, f, ensure_ascii=False, indent=2)
    print("\nTest results saved to test_results.json")
    return test_results
if __name__ == "__main__":
    # Evaluation configuration.
    backbone_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B"
    deal_folder = "deal"
    not_deal_folder = "not_deal"
    batch_size = 8
    ckpt_path = "best_ckpt.pth"
    device = "cuda"
    # Run the evaluation on the held-out test split.
    test(
        backbone_dir=backbone_dir,
        deal_folder=deal_folder,
        not_deal_folder=not_deal_folder,
        batch_size=batch_size,
        ckpt_path=ckpt_path,
        device=device,
    )

472
train.py
View File

@@ -1,149 +1,323 @@
from data_process import build_dataloader from data_process import build_dataloader
from model import TransClassifier from model import TransClassifier, FocalLoss
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from transformers import AutoTokenizer from transformers import AutoTokenizer
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import os import os
import json import json
from datetime import datetime from datetime import datetime
import gc import gc
from tqdm import tqdm from tqdm import tqdm
import warnings import warnings
warnings.filterwarnings("ignore") from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
warnings.filterwarnings("ignore")
class EarlyStopping:
def __init__(self, patience=5, delta=0, path='checkpoint.pt'): class EarlyStopping:
self.patience = patience def __init__(self, patience=5, delta=0):
self.counter = 0 self.patience = patience
self.best_score = None self.counter = 0
self.early_stop = False self.best_score = None
self.val_loss_min = np.inf self.early_stop = False
self.delta = delta self.val_loss_min = np.inf
self.path = path self.delta = delta
def __call__(self, val_loss, model): def __call__(self, val_loss, model, best_ckpt_path):
score = -val_loss score = -val_loss
if self.best_score is None: if self.best_score is None:
self.best_score = score self.best_score = score
self.save_checkpoint(val_loss, model) self.save_checkpoint(val_loss, model, best_ckpt_path)
elif score < self.best_score + self.delta:
self.counter += 1 elif score < self.best_score + self.delta:
print(f'EarlyStopping counter: {self.counter} out of {self.patience}') self.counter += 1
if self.counter >= self.patience: print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
self.early_stop = True if self.counter >= self.patience:
else: self.early_stop = True
self.best_score = score else:
self.save_checkpoint(val_loss, model) self.best_score = score
self.counter = 0 self.save_checkpoint(val_loss, model, best_ckpt_path)
self.counter = 0
def save_checkpoint(self, val_loss, model):
print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...') def save_checkpoint(self, val_loss, model, best_ckpt_path):
torch.save(model.state_dict(), self.path) print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...')
self.val_loss_min = val_loss torch.save(model.state_dict(), best_ckpt_path)
self.val_loss_min = val_loss
def train(backbone_dir, deal_folder, not_deal_folder,
batch_size, initial_lr=1e-5, max_epochs=100, def train(backbone_dir, deal_folder, not_deal_folder,
best_ckpt_path="best_ckpt.pth", final_ckpt_path="final_ckpt.pth", device="cuda"): batch_size, initial_lr=1e-5, max_epochs=100, threshold: int = 10,
device="cuda", use_focal_loss=False, balance=True):
data_dict = build_dataloader(deal_folder, not_deal_folder, batch_size) best_ckpt_path = f"best_ckpt_threshold_{threshold}.pth"
train_loader = data_dict["train"]
val_loader = data_dict["val"] tokenizer = AutoTokenizer.from_pretrained(backbone_dir)
if use_focal_loss:
tokenizer = AutoTokenizer.from_pretrained(backbone_dir) model = TransClassifier(backbone_dir, output_classes=1, device=device)
model = TransClassifier(backbone_dir, device) if balance:
model.to(device) loss_func = FocalLoss(
gamma=2.0,
optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr) alpha=0.5,
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) reduction='mean',
task_type='binary')
loss_func = nn.CrossEntropyLoss() else:
loss_func = FocalLoss(
early_stopping = EarlyStopping(path=best_ckpt_path) gamma=2.0,
history = {"train_loss": [], "val_loss": [], "epoch": []} alpha=0.8,
reduction='mean',
for epoch in range(max_epochs): task_type='binary')
model.train() else:
total_loss = 0.0 model = TransClassifier(backbone_dir, output_classes=2, device=device)
train_steps = 0 loss_func = nn.CrossEntropyLoss()
assert balance == True, "When not using CE loss, balance must be True."
if epoch == 2: model.to(device)
for param in model.backbone.parameters():
param.requires_grad = True data_dict = build_dataloader(deal_data_folder=deal_folder, not_deal_data_folder=not_deal_folder, batch_size=batch_size, threshold=threshold, balance=balance)
print("Unfreeze backbone parameters") train_loader = data_dict["train"]
val_loader = data_dict["val"]
pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs} [Train]') test_loader = data_dict["test"]
for batch_idx, (ids, texts, labels) in enumerate(pbar):
labels = labels.to(device) optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) early_stopping = EarlyStopping(patience=10, delta=0)
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): history = {"train_loss": [], "val_loss": [], "epoch": []}
outputs = model(inputs)
loss = loss_func(outputs, labels) for epoch in range(max_epochs):
model.train()
optimizer.zero_grad() total_loss = 0.0
loss.backward() train_steps = 0
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step() if epoch == 2:
scheduler.step() for param in model.backbone.parameters():
param.requires_grad = True
total_loss += loss.item() print("Unfreeze backbone parameters")
train_steps += 1
pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs} [Train]')
train_loss = total_loss / train_steps for batch_idx, (ids, texts, labels) in enumerate(pbar):
pbar.set_postfix({"train_loss": train_loss}) labels = labels.to(device)
del texts, labels, outputs, loss texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
torch.cuda.empty_cache() inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device)
gc.collect()
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
val_loss = val(val_loader, model, loss_func, tokenizer, device) outputs = model(inputs)
history["train_loss"].append(total_loss / len(train_loader)) if use_focal_loss:
history["val_loss"].append(val_loss) outputs = outputs.squeeze(1)
history["epoch"].append(epoch+1) loss = loss_func(outputs, labels.float())
else:
print(f"Epoch {epoch+1}/{max_epochs}, Train Loss: {total_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}") loss = loss_func(outputs, labels)
early_stopping(val_loss, model) optimizer.zero_grad()
if early_stopping.early_stop: loss.backward()
print("Early stopping") nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
break optimizer.step()
scheduler.step()
torch.save(model.state_dict(), final_ckpt_path)
print(f"Final model saved to {final_ckpt_path}") total_loss += loss.item()
train_steps += 1
history_df = pd.DataFrame(history)
history_df.to_csv("training_history.csv", index=False) train_loss = total_loss / train_steps
print("Training history saved to training_history.csv") pbar.set_postfix({"train_loss": train_loss})
def val(val_loader, model, loss_func, tokenizer, device): del texts, labels, outputs, loss
model.eval() torch.cuda.empty_cache()
val_loss = 0.0 gc.collect()
with torch.no_grad():
for batch_idx, (ids, texts, labels) in enumerate(val_loader): val_loss = val(val_loader, model, loss_func, tokenizer, device, use_focal_loss)
labels = labels.to(device) train_loss_epoch = total_loss / len(train_loader)
history["train_loss"].append(train_loss_epoch)
texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) history["val_loss"].append(val_loss)
inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) history["epoch"].append(epoch+1)
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
outputs = model(inputs) print(f"Epoch {epoch+1}/{max_epochs}, Train Loss: {total_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}")
loss = loss_func(outputs, labels)
early_stopping(val_loss, model, best_ckpt_path)
val_loss += loss.item() if early_stopping.early_stop:
return val_loss / len(val_loader) print("Early stopping")
break
if __name__ == "__main__":
backbone_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B" history_df = pd.DataFrame(history)
deal_folder = "deal" history_df.to_csv(f"training_history_threshold_{threshold}.csv", index=False)
not_deal_folder = "not_deal" print(f"Training history saved to training_history_threshold_{threshold}.csv")
batch_size = 8 return test_loader
device = "cuda"
def val(val_loader, model, loss_func, tokenizer, device, use_focal_loss=False):
model.eval()
train(backbone_dir, deal_folder, not_deal_folder, batch_size, device=device) val_loss = 0.0
with torch.no_grad():
for batch_idx, (ids, texts, labels) in enumerate(val_loader):
labels = labels.to(device)
texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device)
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
outputs = model(inputs)
if use_focal_loss:
outputs = outputs.squeeze(1)
loss = loss_func(outputs, labels.float())
else:
loss = loss_func(outputs, labels)
val_loss += loss.item()
del inputs, outputs, labels, loss
gc.collect()
torch.cuda.empty_cache()
return val_loss / len(val_loader)
def test(backbone_dir, test_loader, device, threshold, use_focal_loss=False, balance=True):
    """Evaluate the best checkpoint for one threshold on its test loader.

    Args:
        backbone_dir: directory of the pretrained backbone / tokenizer.
        test_loader: DataLoader yielding (ids, texts, labels) batches.
        device: torch device to run on.
        threshold: turn-count threshold; selects best_ckpt_threshold_{threshold}.pth
            and is embedded in the output file names.
        use_focal_loss: if True the model was trained with a single sigmoid
            logit; otherwise a two-way softmax head.
        balance: kept for signature parity with train(); unused here.

    Side effects:
        Writes test_results_threshold_{threshold}.json and
        test_predictions_threshold_{threshold}.csv.
    """
    tokenizer = AutoTokenizer.from_pretrained(backbone_dir)
    # Head size must match how the checkpoint was trained.
    if use_focal_loss:
        model = TransClassifier(backbone_dir, output_classes=1, device=device)
    else:
        model = TransClassifier(backbone_dir, output_classes=2, device=device)
    model.to(device)
    ckpt_path = f"best_ckpt_threshold_{threshold}.pth"
    if os.path.exists(ckpt_path):
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
        print(f"Model loaded from {ckpt_path}")
    else:
        print(f"Warning: {ckpt_path} not found. Using untrained model.")
    model.eval()
    all_ids = []
    all_preds = []
    all_probs = []
    all_labels = []
    pbar = tqdm(test_loader, desc="Testing")
    with torch.inference_mode():
        for batch_idx, (ids, texts, labels) in enumerate(pbar):
            all_ids.extend(ids)
            labels = labels.to(device)
            texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False)
            inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device)
            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                outputs = model(inputs)
            # Probabilities are computed in float32 to avoid bf16 precision
            # issues.  The two consecutive duplicate `if use_focal_loss`
            # blocks from the original are merged into one branch.
            if use_focal_loss:
                outputs = outputs.squeeze(-1)  # [B, 1] -> [B]
                probs = torch.sigmoid(outputs.float()).cpu().numpy().tolist()  # [B]
                preds = [1 if p >= 0.5 else 0 for p in probs]
            else:
                preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist()
                probs = torch.softmax(outputs.float(), dim=1)  # probs: [B, 2]
                probs = probs.cpu().numpy().tolist()
                probs = [p[1] for p in probs]  # keep P(class=1) for AUC
            all_preds.extend(preds)
            all_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())
            # Free GPU memory between batches.
            del texts, labels, outputs
            torch.cuda.empty_cache()
            gc.collect()
    # Aggregate metrics; per-class arrays computed once.
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average="weighted")
    recall = recall_score(all_labels, all_preds, average="weighted")
    f1 = f1_score(all_labels, all_preds, average="weighted")
    auc = roc_auc_score(all_labels, all_probs)
    cm = confusion_matrix(all_labels, all_preds)
    # Print the evaluation report.
    print("\n=== Test Results ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    cm_df = pd.DataFrame(cm,
                         index=['Actual Not Deal (0)', 'Actual Deal (1)'],
                         columns=['Predicted Not Deal (0)', 'Predicted Deal (1)'])
    print("\nConfusion Matrix:")
    print(cm_df)
    precision_per_class = precision_score(all_labels, all_preds, average=None)
    recall_per_class = recall_score(all_labels, all_preds, average=None)
    f1_per_class = f1_score(all_labels, all_preds, average=None)
    print("\n=== Class-wise Metrics ===")
    print("Class 0 (Not Deal):")
    print(f"  Precision: {precision_per_class[0]:.4f}")
    print(f"  Recall: {recall_per_class[0]:.4f}")
    print(f"  F1 Score: {f1_per_class[0]:.4f}")
    print("\nClass 1 (Deal):")
    print(f"  Precision: {precision_per_class[1]:.4f}")
    print(f"  Recall: {recall_per_class[1]:.4f}")
    print(f"  F1 Score: {f1_per_class[1]:.4f}")
    # Persist metrics; numpy scalars cast to Python floats for JSON.
    test_results = {
        "accuracy": float(accuracy),
        "precision_weighted": float(precision),
        "recall_weighted": float(recall),
        "f1_weighted": float(f1),
        "auc": float(auc),
        "confusion_matrix": cm.tolist(),
        "class_0_precision": float(precision_per_class[0]),
        "class_0_recall": float(recall_per_class[0]),
        "class_0_f1": float(f1_per_class[0]),
        "class_1_precision": float(precision_per_class[1]),
        "class_1_recall": float(recall_per_class[1]),
        "class_1_f1": float(f1_per_class[1]),
        "test_samples": len(all_labels)
    }
    with open(f"test_results_threshold_{threshold}.json", "w", encoding="utf-8") as f:
        json.dump(test_results, f, ensure_ascii=False, indent=4)
    print(f"\nTest results saved to test_results_threshold_{threshold}.json")
    # Persist per-sample predictions.
    pred_df = pd.DataFrame({
        "ids": all_ids,
        "predictions": all_preds,
        "probability": all_probs,
        "true_labels": all_labels
    })
    pred_df.to_csv(f"test_predictions_threshold_{threshold}.csv", index=False, encoding="utf-8")
if __name__ == "__main__":
    backbone_dir = "Qwen3-1.7B"
    deal_folder = "filtered_deal"
    not_deal_folder = "filtered_not_deal"
    batch_size = 4
    device = "cuda"
    # Single-threshold run, kept for reference as comments rather than as a
    # dead bare-string expression:
    #   threshold = 10
    #   test_loader = train(backbone_dir=backbone_dir, deal_folder=deal_folder, not_deal_folder=not_deal_folder, batch_size=batch_size, threshold=threshold, device=device, use_focal_loss=False, balance=True)
    #   test(backbone_dir=backbone_dir, test_loader=test_loader, device=device, threshold=threshold, use_focal_loss=False, balance=True)
    # Sweep thresholds 3..8: train one model per threshold, then evaluate it
    # on that threshold's test split.  (Removed the unused max_threshold
    # variable that was never read.)
    for i in range(3, 9):
        print(f"Training with threshold {i}...")
        test_loader = train(backbone_dir=backbone_dir, deal_folder=deal_folder, not_deal_folder=not_deal_folder, batch_size=batch_size, threshold=i, device=device, use_focal_loss=False, balance=True)
        test(
            backbone_dir=backbone_dir,
            test_loader=test_loader,
            device=device,
            threshold=i,
            use_focal_loss=False,
            balance=True
        )

29
visualize_training.py Normal file
View File

@@ -0,0 +1,29 @@
import pandas as pd
import matplotlib.pyplot as plt
def visualize_training_history(threshold):
    """Plot the train/val loss curves for one threshold's training run.

    Reads training_history_threshold_{threshold}.csv and saves the figure to
    training_visualization_threshold_{threshold}.png. A missing history file
    is skipped with a message instead of aborting the whole sweep (the
    __main__ loop iterates thresholds that were not necessarily trained).
    """
    csv_path = f'training_history_threshold_{threshold}.csv'
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Not every threshold in the sweep has a training history.
        print(f'Skipping threshold {threshold}: {csv_path} not found')
        return
    epochs = df['epoch']
    train_loss = df['train_loss']
    val_loss = df['val_loss']
    plt.figure(figsize=(10, 6))
    plt.plot(epochs, train_loss, 'b-', label='Training Loss')
    plt.plot(epochs, val_loss, 'r-', label='Validation Loss')
    plt.title('Training and Validation Loss Over Epochs (CE)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'training_visualization_threshold_{threshold}.png')
    plt.show()
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close()
    print(f'可视化完成,图表已保存为 training_visualization_threshold_{threshold}.png')
if __name__ == "__main__":
    # Render one loss plot per threshold, 0 through 10.
    for threshold in range(11):
        visualize_training_history(threshold)
visualize_training_history(i)