From fe942c023d61a506fdff40cff5d2f87e51f59257 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:36:12 +0800 Subject: [PATCH 01/12] Update data_process/process/content_extract.py --- data_process/process/content_extract.py | 30 ++++++++++--------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/data_process/process/content_extract.py b/data_process/process/content_extract.py index da959e0..cd05e29 100644 --- a/data_process/process/content_extract.py +++ b/data_process/process/content_extract.py @@ -7,14 +7,14 @@ valid_keys = [ "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", - "Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" + "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential" ] ch_valid_keys = [ "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", - "竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" + "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力" ] all_keys = valid_keys + ["session_id", "label"] en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} @@ -22,7 +22,7 @@ d1_keys = valid_keys[:5] d2_keys = valid_keys[5:10] d3_keys = valid_keys[10:15] d4_keys = valid_keys[15:19] -d5_keys = valid_keys[19:23] +d5_keys = valid_keys[19:22] def extract_json_files(folder: str): json_files = glob.glob(os.path.join(folder, "*.json")) @@ -47,17 +47,17 @@ def try_match_error_key(error_key: str): else: return None -def filt_json_data(json_data: dict): +def filt_json_data(json_data: dict, threshold: int = 10): new_json_data = {} for k, v in json_data.items(): - if len(v) >= 10: + if len(v) >= threshold and len(v) != 0: new_json_data[k] = v - print(f"Total {len(new_json_data)} json keys after filter") + print(f"Total {len(new_json_data)} json data after filter with threshold {threshold}") return new_json_data -def extract_json_data(json_files: list) -> dict: +def extract_json_data(json_files: list, threshold: int = 10) -> dict: data = {} for json_file in json_files: session_id = os.path.basename(json_file).split(".")[0] @@ -66,24 +66,18 @@ def extract_json_data(json_files: list) -> dict: json_data = json.load(f) for key, value in json_data.items(): if key in valid_keys: - data[session_id][key] = value['value'] + data[session_id][key] = value.get("value", None) + elif key == "Follow_up_Priority": + continue else: match_key = try_match_error_key(key) if match_key: - data[session_id][match_key] = value['value'] + data[session_id][match_key] = value.get("value", None) else: raise ValueError(f"Invalid key {key} in {json_file}") - return filt_json_data(data) + return filt_json_data(data, threshold) if __name__=="__main__": deal_folder = "deal" not_deal_folder = "not_deal" - - deal_json_files = extract_json_files(deal_folder) - deal_data = extract_json_data(deal_json_files) - deal_txt_files = extract_txt_files(deal_folder) - - not_deal_json_files = extract_json_files(not_deal_folder) - not_deal_data = extract_json_data(not_deal_json_files) - not_deal_txt_files = extract_txt_files(not_deal_folder) From f3959031648720894e92daa8e82df3b2b8fbbd76 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:36:56 +0800 Subject: [PATCH 02/12] Update data_process/process/preprocess.py --- data_process/process/preprocess.py | 390 ++++++++++++++++------------- 1 file changed, 218 insertions(+), 172 deletions(-) diff --git a/data_process/process/preprocess.py b/data_process/process/preprocess.py index aeb4eed..e55d65c 100644 --- a/data_process/process/preprocess.py +++ b/data_process/process/preprocess.py @@ -1,172 +1,218 @@ -import random -import numpy as np - -from torch.utils.data import Dataset, DataLoader, random_split -import torch -from transformers import AutoTokenizer - -from .content_extract import extract_json_files, extract_json_data - -valid_keys = [ - "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index", - "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", - "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", - "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", - "Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" -] -ch_valid_keys = [ - "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", - "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", - "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", - "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", - "竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" -] -all_keys = valid_keys + ["session_id", "label"] -en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} -d1_keys = valid_keys[:5] -d2_keys = valid_keys[5:10] -d3_keys = valid_keys[10:15] -d4_keys = valid_keys[15:19] -d5_keys = valid_keys[19:23] - -class Formatter: - def __init__(self, en2ch): - self.en2ch = en2ch - - def _build_user_profile(self, profile: dict) -> str: - sections = [] - sections.append("[客户画像]") - sections.append("\n [痛感和焦虑等级]") - for key in d1_keys: - if key in profile: - sections.append(f"{self.en2ch[key]}: {profile[key]}") - - sections.append("\n [支付意愿与能力]") - for key in d2_keys: - if key in profile: - sections.append(f"{self.en2ch[key]}: {profile[key]}") - - sections.append("\n [成交阻力与防御机制]") - for key in d3_keys: - if key in profile: - sections.append(f"{self.en2ch[key]}: {profile[key]}") - - sections.append("\n [情绪钩子与成交切入点]") - for key in d4_keys: - if key in profile: - sections.append(f"{self.en2ch[key]}: {profile[key]}") - - sections.append("\n [客户生命周期状态]") - for key in d5_keys: - if key in profile: - sections.append(f"{self.en2ch[key]}: {profile[key]}") - return "\n".join(sections) - - def get_llm_prompt(self, features): - user_profile = self._build_user_profile(features) - - prompt = f""" - 你是一个销售心理学专家,请分析以下客户特征: - - {user_profile} - - 请提取客户的核心购买驱动力和主要障碍后分析该客户的成交概率。将成交概率以JSON格式输出: - {{ - "conversion_probability": 0-1之间的数值 - }} - """ - - messages = [ - {"role": "user", "content": prompt} - ] - return messages - -class TransDataset(Dataset): - def __init__(self, deal_data_folder, not_deal_data_folder): - self.deal_data = extract_json_data(extract_json_files(deal_data_folder)) - self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder)) - - self.formatter = Formatter(en2ch) - - num_deal = len(self.deal_data) - num_not_deal = len(self.not_deal_data) - num_threshold = max(num_deal, num_not_deal) * 0.8 - - if not all([num_deal >= num_threshold, num_not_deal >= num_threshold]): - self._balance_samples() - - self._build_samples() - - def _build_samples(self): - self.samples = [] - - for id, features in self.deal_data.items(): - messages = self.formatter.get_llm_prompt(features) - self.samples.append((id, messages, 1)) - for id, features in self.not_deal_data.items(): - messages = self.formatter.get_llm_prompt(features) - self.samples.append((id, messages, 0)) - - random.shuffle(self.samples) - print(f"total samples num: {len(self.samples)}, deal num: {len(self.deal_data)}, not deal num: {len(self.not_deal_data)}") - - def _balance_samples(self): - random.seed(42) - np.random.seed(42) - - not_deal_ids = list(self.not_deal_data.keys()) - target_size = len(self.deal_data) - - if len(not_deal_ids) > target_size: - selected_not_deal_ids = random.sample(not_deal_ids, target_size) - self.not_deal_data = {sid: self.not_deal_data[sid] for sid in selected_not_deal_ids} - - def __len__(self): - return len(self.samples) - - def __getitem__(self, idx): - id, prompt, label = self.samples[idx] - return id, prompt, label - -def build_dataloader(deal_data_folder, not_deal_data_folder, batch_size): - dataset = TransDataset(deal_data_folder, not_deal_data_folder) - num_data = len(dataset) - - train_size = int(0.8 * num_data) - val_size = int(0.1 * num_data) - test_size = num_data - train_size - val_size - - print(f"train size: {train_size}") - print(f"val size: {val_size}") - print(f"test size: {test_size}") - train_dataset, val_dataset, test_dataset = random_split( - dataset, - [train_size, val_size, test_size], - generator=torch.Generator().manual_seed(42) - ) - - def collate_fn(batch): - ids = [item[0] for item in batch] - texts = [item[1] for item in batch] - labels = torch.tensor([item[2] for item in batch], dtype=torch.long) - return ids, texts, labels - - train_loader = DataLoader( - train_dataset, - batch_size=batch_size, - shuffle=True, - collate_fn=collate_fn - ) - val_loader = DataLoader( - val_dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=collate_fn - ) - test_loader = DataLoader( - test_dataset, - batch_size=batch_size, - shuffle=False, - collate_fn=collate_fn - ) - return {"train": train_loader, "val": val_loader, "test": test_loader} \ No newline at end of file +import random +import numpy as np + +from torch.utils.data import Dataset, DataLoader, random_split +import torch +from transformers import AutoTokenizer + +from .content_extract import extract_json_files, extract_json_data + +valid_keys = [ + "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index", + "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", + "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", + "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", + "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential" +] +ch_valid_keys = [ + "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", + "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", + "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", + "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", + "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力" +] +all_keys = valid_keys + ["session_id", "label"] +en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} +d1_keys = valid_keys[:5] +d2_keys = valid_keys[5:10] +d3_keys = valid_keys[10:15] +d4_keys = valid_keys[15:19] +d5_keys = valid_keys[19:22] + +class Formatter: + def __init__(self, en2ch): + self.en2ch = en2ch + + def _build_user_profile(self, profile: dict) -> str: + sections = [] + sections.append("[客户画像]") + sections.append("\n [痛感和焦虑等级]") + for key in d1_keys: + if key in profile: + if profile[key] is None: + continue + sections.append(f"{self.en2ch[key]}: {profile[key]}") + + sections.append("\n [支付意愿与能力]") + for key in d2_keys: + if key in profile: + if profile[key] is None: + continue + sections.append(f"{self.en2ch[key]}: {profile[key]}") + + sections.append("\n [成交阻力与防御机制]") + for key in d3_keys: + if key in profile: + if profile[key] is None: + continue + sections.append(f"{self.en2ch[key]}: {profile[key]}") + + sections.append("\n [情绪钩子与成交切入点]") + for key in d4_keys: + if key in profile: + if profile[key] is None: + continue + sections.append(f"{self.en2ch[key]}: {profile[key]}") + + sections.append("\n [客户生命周期状态]") + for key in d5_keys: + if key in profile: + if profile[key] is None: + continue + sections.append(f"{self.en2ch[key]}: {profile[key]}") + return "\n".join(sections) + + def get_llm_prompt(self, features: dict) -> list: + user_profile = self._build_user_profile(features) + + prompt = prompt = f""" + 请分析以下客户特征,预测成交概率(0~1之间)。 + + {user_profile} + + 成交概率: + """ + + messages = [ + {"role": "user", "content": prompt} + ] + return messages + +class TransDataset(Dataset): + def __init__(self, deal_data_folder, not_deal_data_folder, threshold: int = 10, balance: bool = True): + self.deal_data = extract_json_data(extract_json_files(deal_data_folder), threshold) + self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder), threshold) + + self.formatter = Formatter(en2ch) + + num_deal = len(self.deal_data) + num_not_deal = len(self.not_deal_data) + num_threshold = max(num_deal, num_not_deal) * 0.8 + + if not all([num_deal >= num_threshold, num_not_deal >= num_threshold]) and balance: + self._balance_samples() + + self._build_samples() + + def _build_samples(self): + self.samples = [] + + for id, features in self.deal_data.items(): + messages = self.formatter.get_llm_prompt(features) + self.samples.append((id, messages, 1)) + for id, features in self.not_deal_data.items(): + messages = self.formatter.get_llm_prompt(features) + self.samples.append((id, messages, 0)) + + random.shuffle(self.samples) + print(f"total samples num: {len(self.samples)}, deal num: {len(self.deal_data)}, not deal num: {len(self.not_deal_data)}") + + def _balance_samples(self): + random.seed(42) + np.random.seed(42) + + not_deal_ids = list(self.not_deal_data.keys()) + target_size = len(self.deal_data) + + if len(not_deal_ids) > target_size: + selected_not_deal_ids = random.sample(not_deal_ids, target_size) + self.not_deal_data = {sid: self.not_deal_data[sid] for sid in selected_not_deal_ids} + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + id, prompt, label = self.samples[idx] + return id, prompt, label + +class OfflineTransDataset(Dataset): + def __init__(self, deal_data_folder, not_deal_data_folder, threshold: int = 10): + self.deal_data = extract_json_data(extract_json_files(deal_data_folder), threshold) + self.not_deal_data = extract_json_data(extract_json_files(not_deal_data_folder), threshold) + + self.formatter = Formatter(en2ch) + self.samples = [] + + for id, features in self.deal_data.items(): + messages = self.formatter.get_llm_prompt(features) + self.samples.append((id, messages, 1)) + for id, features in self.not_deal_data.items(): + messages = self.formatter.get_llm_prompt(features) + self.samples.append((id, messages, 0)) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, idx): + id, prompt, label = self.samples[idx] + return id, prompt, label + +def build_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold: int = 10, balance: bool = True): + dataset = TransDataset(deal_data_folder, not_deal_data_folder, threshold, balance) + num_data = len(dataset) + + train_size = int(0.8 * num_data) + val_size = int(0.1 * num_data) + test_size = num_data - train_size - val_size + + print(f"train size: {train_size}") + print(f"val size: {val_size}") + print(f"test size: {test_size}") + train_dataset, val_dataset, test_dataset = random_split( + dataset, + [train_size, val_size, test_size], + generator=torch.Generator().manual_seed(42) + ) + + def collate_fn(batch): + ids = [item[0] for item in batch] + texts = [item[1] for item in batch] + labels = torch.tensor([item[2] for item in batch], dtype=torch.long) + return ids, texts, labels + + train_loader = DataLoader( + train_dataset, + batch_size=batch_size, + shuffle=True, + collate_fn=collate_fn + ) + val_loader = DataLoader( + val_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=collate_fn + ) + test_loader = DataLoader( + test_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=collate_fn + ) + return {"train": train_loader, "val": val_loader, "test": test_loader} + +def build_offline_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold: int = 10): + dataset = OfflineTransDataset(deal_data_folder, not_deal_data_folder, threshold) + + def collate_fn(batch): + ids = [item[0] for item in batch] + texts = [item[1] for item in batch] + labels = torch.tensor([item[2] for item in batch], dtype=torch.long) + return ids, texts, labels + + offline_loader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=True, + collate_fn=collate_fn + ) + return offline_loader \ No newline at end of file From daa5d12fcd8e4631bb5741f820ed7fd6e3b7e9cc Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:37:39 +0800 Subject: [PATCH 03/12] Update model/__init__.py --- model/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/model/__init__.py b/model/__init__.py index 8d22e5a..4692f80 100644 --- a/model/__init__.py +++ b/model/__init__.py @@ -1 +1,2 @@ -from .modelling import TransClassifier \ No newline at end of file +from .modelling import TransClassifier +from .focal_loss import FocalLoss \ No newline at end of file From 14ce733d362b2b76cd1f037d0ce8bedb18dffc9e Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:37:55 +0800 Subject: [PATCH 04/12] Update model/modelling.py --- model/modelling.py | 107 ++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/model/modelling.py b/model/modelling.py index 73bb2d8..8baa4ae 100644 --- a/model/modelling.py +++ b/model/modelling.py @@ -1,52 +1,57 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import AutoModel - -class TransClassifier(nn.Module): - def __init__(self, model_dir: str, device: str="cuda"): - super().__init__() - self.backbone = AutoModel.from_pretrained( - model_dir, - dtype = "bfloat16" - ).to(device).eval() - self.device = device - self.torch_dtype = torch.bfloat16 - self.hidden_size = self.backbone.config.hidden_size - - self.classifier = nn.Sequential( - nn.LayerNorm(self.hidden_size), - nn.Linear(self.hidden_size, self.hidden_size//2), - nn.GELU(), - nn.Dropout(0.3), - nn.Linear(self.hidden_size//2, self.hidden_size//4), - nn.GELU(), - nn.Dropout(0.2), - nn.Linear(self.hidden_size//4, 2) - ).to(device=device, dtype=self.torch_dtype) - - for param in self.backbone.parameters(): - param.requires_grad = False - - def forward(self, model_inputs: dict): - outputs = self.backbone(**model_inputs) - - last_hidden_state = outputs.last_hidden_state - # take last token hidden state - cls_hidden_state = last_hidden_state[:, -1, :] - - logits = self.classifier(cls_hidden_state) - return logits - -if __name__ == "__main__": - model_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B" - device = "cuda" - model = TransClassifier(model_dir, device) - print(model.hidden_size) - print(model) - - total_params = sum(p.numel() for p in model.parameters()) - trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - - print(f"总参数量: {total_params:,}") +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import AutoModel + +class TransClassifier(nn.Module): + def __init__(self, model_dir: str, output_classes: int, device: str="cuda"): + super().__init__() + self.backbone = AutoModel.from_pretrained( + model_dir, + dtype = "bfloat16", + attn_implementation="flash_attention_2" + ).to(device).eval() + self.device = device + self.torch_dtype = torch.bfloat16 + self.hidden_size = self.backbone.config.hidden_size + + self.token_proj = nn.Linear(self.hidden_size, self.hidden_size).to(device=device, dtype=self.torch_dtype) + self.classifier = nn.Sequential( + nn.LayerNorm(self.hidden_size), + nn.Linear(self.hidden_size, self.hidden_size//2), + nn.GELU(), + nn.Dropout(0.3), + nn.Linear(self.hidden_size//2, self.hidden_size//4), + nn.GELU(), + nn.Dropout(0.2), + nn.Linear(self.hidden_size//4, output_classes) + ).to(device=device, dtype=self.torch_dtype) + + for param in self.backbone.parameters(): + param.requires_grad = False + + def forward(self, model_inputs: dict): + outputs = self.backbone(**model_inputs) + proj_states = self.token_proj(outputs.last_hidden_state) + + attention_mask = model_inputs['attention_mask'] + mask_expanded = attention_mask.unsqueeze(-1).expand_as(proj_states).to(proj_states.dtype) + sum_states = (proj_states * mask_expanded).sum(dim=1) + valid_tokens = mask_expanded.sum(dim=1) + pooled = sum_states / valid_tokens.clamp(min=1e-9) + + logits = self.classifier(pooled) + return logits + +if __name__ == "__main__": + model_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B" + device = "cuda" + model = TransClassifier(model_dir, device) + print(model.hidden_size) + print(model) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + print(f"总参数量: {total_params:,}") print(f"可训练参数量: {trainable_params:,}") \ No newline at end of file From 5a094286e04c6e0035099573165f23c79637093b Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:39:04 +0800 Subject: [PATCH 05/12] Add model/focal_loss.py --- model/focal_loss.py | 135 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 model/focal_loss.py diff --git a/model/focal_loss.py b/model/focal_loss.py new file mode 100644 index 0000000..d38d61f --- /dev/null +++ b/model/focal_loss.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FocalLoss(nn.Module): + def __init__(self, gamma=2, alpha=None, reduction='mean', task_type='binary', num_classes=None): + """ + Unified Focal Loss class for binary, multi-class, and multi-label classification tasks. + :param gamma: Focusing parameter, controls the strength of the modulating factor (1 - p_t)^gamma + :param alpha: Balancing factor, can be a scalar or a tensor for class-wise weights. If None, no class balancing is used. + :param reduction: Specifies the reduction method: 'none' | 'mean' | 'sum' + :param task_type: Specifies the type of task: 'binary', 'multi-class', or 'multi-label' + :param num_classes: Number of classes (only required for multi-class classification) + """ + super(FocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.task_type = task_type + self.num_classes = num_classes + + # Handle alpha for class balancing in multi-class tasks + if task_type == 'multi-class' and alpha is not None and isinstance(alpha, (list, torch.Tensor)): + assert num_classes is not None, "num_classes must be specified for multi-class classification" + if isinstance(alpha, list): + self.alpha = torch.Tensor(alpha) + else: + self.alpha = alpha + + def forward(self, inputs, targets): + """ + Forward pass to compute the Focal Loss based on the specified task type. + :param inputs: Predictions (logits) from the model. + Shape: + - binary/multi-label: (batch_size, num_classes) + - multi-class: (batch_size, num_classes) + :param targets: Ground truth labels. + Shape: + - binary: (batch_size,) + - multi-label: (batch_size, num_classes) + - multi-class: (batch_size,) + """ + if self.task_type == 'binary': + return self.binary_focal_loss(inputs, targets) + elif self.task_type == 'multi-class': + return self.multi_class_focal_loss(inputs, targets) + elif self.task_type == 'multi-label': + return self.multi_label_focal_loss(inputs, targets) + else: + raise ValueError( + f"Unsupported task_type '{self.task_type}'. Use 'binary', 'multi-class', or 'multi-label'.") + + def binary_focal_loss(self, inputs, targets): + """ Focal loss for binary classification. """ + probs = torch.sigmoid(inputs) + targets = targets.float() + + # Compute binary cross entropy + bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none') + + # Compute focal weight + p_t = probs * targets + (1 - probs) * (1 - targets) + focal_weight = (1 - p_t) ** self.gamma + + # Apply alpha if provided + if self.alpha is not None: + alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets) + bce_loss = alpha_t * bce_loss + + # Apply focal loss weighting + loss = focal_weight * bce_loss + + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + return loss + + def multi_class_focal_loss(self, inputs, targets): + """ Focal loss for multi-class classification. """ + if self.alpha is not None: + alpha = self.alpha.to(inputs.device) + + # Convert logits to probabilities with softmax + probs = F.softmax(inputs, dim=1) + + # One-hot encode the targets + targets_one_hot = F.one_hot(targets, num_classes=self.num_classes).float() + + # Compute cross-entropy for each class + ce_loss = -targets_one_hot * torch.log(probs) + + # Compute focal weight + p_t = torch.sum(probs * targets_one_hot, dim=1) # p_t for each sample + focal_weight = (1 - p_t) ** self.gamma + + # Apply alpha if provided (per-class weighting) + if self.alpha is not None: + alpha_t = alpha.gather(0, targets) + ce_loss = alpha_t.unsqueeze(1) * ce_loss + + # Apply focal loss weight + loss = focal_weight.unsqueeze(1) * ce_loss + + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + return loss + + def multi_label_focal_loss(self, inputs, targets): + """ Focal loss for multi-label classification. """ + probs = torch.sigmoid(inputs) + + # Compute binary cross entropy + bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none') + + # Compute focal weight + p_t = probs * targets + (1 - probs) * (1 - targets) + focal_weight = (1 - p_t) ** self.gamma + + # Apply alpha if provided + if self.alpha is not None: + alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets) + bce_loss = alpha_t * bce_loss + + # Apply focal loss weight + loss = focal_weight * bce_loss + + if self.reduction == 'mean': + return loss.mean() + elif self.reduction == 'sum': + return loss.sum() + return loss From 39bd04f0e00c20bb23a57276af1a6329fe56fdf5 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:39:58 +0800 Subject: [PATCH 06/12] Add offline_test.py --- offline_test.py | 130 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 offline_test.py diff --git a/offline_test.py b/offline_test.py new file mode 100644 index 0000000..32e8d6a --- /dev/null +++ b/offline_test.py @@ -0,0 +1,130 @@ +from data_process import build_offline_dataloader +from model import TransClassifier + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import pandas as pd +import numpy as np +import os +import json +import gc +from tqdm import tqdm +import warnings +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score +from transformers import AutoTokenizer +warnings.filterwarnings("ignore") + + +def offline_test( + deal_data_folder, not_deal_data_folder, batch_size, threshold, + backbone_dir, ckpt_path, device, filtered=False + ): + offline_loader = build_offline_dataloader(deal_data_folder, not_deal_data_folder, batch_size, threshold) + + tokenizer = AutoTokenizer.from_pretrained(backbone_dir) + model = TransClassifier(backbone_dir, 2, device) + model.load_state_dict(torch.load(ckpt_path, map_location=device)) + model.eval() + + all_ids = [] + all_preds = [] + all_probs = [] + all_labels = [] + + pbar = tqdm(offline_loader, desc="Testing") + with torch.inference_mode(): + for batch_idx, (ids, texts, labels) in enumerate(pbar): + all_ids.extend(ids) + labels = labels.to(device) + + texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) + inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) + + with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): + outputs = model(inputs) + preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + outputs_float = outputs.float() # 转换为 float32 避免精度问题 + probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] + probs = probs.cpu().numpy().tolist() + probs = [p[1] for p in probs] + + all_preds.extend(preds) + all_probs.extend(probs) + all_labels.extend(labels.cpu().numpy()) + + # 清理内存 + del texts, labels, outputs + torch.cuda.empty_cache() + gc.collect() + + # 计算评估指标 + accuracy = accuracy_score(all_labels, all_preds) + precision = precision_score(all_labels, all_preds, average="weighted") + recall = recall_score(all_labels, all_preds, average="weighted") + f1 = f1_score(all_labels, all_preds, average="weighted") + auc = roc_auc_score(all_labels, all_probs) + cm = confusion_matrix(all_labels, all_preds) + + precision_per_class = precision_score(all_labels, all_preds, average=None) + recall_per_class = recall_score(all_labels, all_preds, average=None) + f1_per_class = f1_score(all_labels, all_preds, average=None) + + test_results = { + "accuracy": accuracy, + "precision_weighted": precision, + "recall_weighted": recall, + "f1_weighted": f1, + "auc": auc, + "confusion_matrix": cm.tolist(), + "class_0_precision": precision_per_class[0], + "class_0_recall": recall_per_class[0], + "class_0_f1": f1_per_class[0], + "class_1_precision": precision_per_class[1], + "class_1_recall": recall_per_class[1], + "class_1_f1": f1_per_class[1] + } + + if filtered: + with open(f"offline_test_result_filtered_{threshold}.json", "w", encoding="utf-8") as f: + json.dump(test_results, f, ensure_ascii=False, indent=4) + else: + with open(f"offline_test_result_{threshold}.json", "w", encoding="utf-8") as f: + json.dump(test_results, f, ensure_ascii=False, indent=4) + + pred_df = pd.DataFrame({ + "ids": all_ids, + "predictions": all_preds, + "probability": all_probs, + "true_labels": all_labels + }) + if filtered: + pred_df.to_csv(f"offline_test_predictions_filtered_{threshold}.csv", index=False, encoding="utf-8") + else: + pred_df.to_csv(f"offline_test_predictions_{threshold}.csv", index=False, encoding="utf-8") + +if __name__=="__main__": + filtered_deal_folder = "not_trained_deal_filtered" + filtered_not_deal_folder = "not_trained_not_deal_filtered" + + deal_folder = "not_trained_deal" + not_deal_folder = "not_trained_not_deal" + + batch_size = 8 + backbone_dir = "Qwen3-1.7B" + + """ for i in range(3, 9): + threshold = i + ckpt_path = f"best_ckpt_threshold_{threshold}.pth" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + + offline_test(deal_folder, not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=False) + offline_test(filtered_deal_folder, filtered_not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=True) """ + + threshold = 5 + ckpt_path = f"best_ckpt_threshold_{threshold}_1st.pth" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + offline_test(deal_folder, not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=False) + offline_test(filtered_deal_folder, filtered_not_deal_folder, batch_size, threshold, backbone_dir, ckpt_path, device, filtered=True) \ No newline at end of file From b00247cfb7c8dd2b2e8d2d2665cc82ae8accb4cd Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:40:28 +0800 Subject: [PATCH 07/12] Update train.py --- train.py | 472 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 323 insertions(+), 149 deletions(-) diff --git a/train.py b/train.py index 78b91f6..3d31f47 100644 --- a/train.py +++ b/train.py @@ -1,149 +1,323 @@ -from data_process import build_dataloader -from model import TransClassifier - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import AutoTokenizer - -import pandas as pd -import numpy as np -import os -import json -from datetime import datetime -import gc -from tqdm import tqdm -import warnings -warnings.filterwarnings("ignore") - -class EarlyStopping: - def __init__(self, patience=5, delta=0, path='checkpoint.pt'): - self.patience = patience - self.counter = 0 - self.best_score = None - self.early_stop = False - self.val_loss_min = np.inf - self.delta = delta - self.path = path - - def __call__(self, val_loss, model): - score = -val_loss - - if self.best_score is None: - self.best_score = score - self.save_checkpoint(val_loss, model) - elif score < self.best_score + self.delta: - self.counter += 1 - print(f'EarlyStopping counter: {self.counter} out of {self.patience}') - if self.counter >= self.patience: - self.early_stop = True - else: - self.best_score = score - self.save_checkpoint(val_loss, model) - self.counter = 0 - - def save_checkpoint(self, val_loss, model): - print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...') - torch.save(model.state_dict(), self.path) - self.val_loss_min = val_loss - -def train(backbone_dir, deal_folder, not_deal_folder, - batch_size, initial_lr=1e-5, max_epochs=100, - best_ckpt_path="best_ckpt.pth", final_ckpt_path="final_ckpt.pth", device="cuda"): - - data_dict = build_dataloader(deal_folder, not_deal_folder, batch_size) - train_loader = data_dict["train"] - val_loader = data_dict["val"] - - tokenizer = AutoTokenizer.from_pretrained(backbone_dir) - model = TransClassifier(backbone_dir, device) - model.to(device) - - optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr) - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) - - loss_func = nn.CrossEntropyLoss() - - early_stopping = EarlyStopping(path=best_ckpt_path) - history = {"train_loss": [], "val_loss": [], "epoch": []} - - for epoch in range(max_epochs): - model.train() - total_loss = 0.0 - train_steps = 0 - - if epoch == 2: - for param in model.backbone.parameters(): - param.requires_grad = True - print("Unfreeze backbone parameters") - - pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs} [Train]') - for batch_idx, (ids, texts, labels) in enumerate(pbar): - labels = labels.to(device) - - texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) - inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) - with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): - outputs = model(inputs) - loss = loss_func(outputs, labels) - - optimizer.zero_grad() - loss.backward() - nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) - optimizer.step() - scheduler.step() - - total_loss += loss.item() - train_steps += 1 - - train_loss = total_loss / train_steps - pbar.set_postfix({"train_loss": train_loss}) - - del texts, labels, outputs, loss - torch.cuda.empty_cache() - gc.collect() - - val_loss = val(val_loader, model, loss_func, tokenizer, device) - history["train_loss"].append(total_loss / len(train_loader)) - history["val_loss"].append(val_loss) - history["epoch"].append(epoch+1) - - print(f"Epoch {epoch+1}/{max_epochs}, Train Loss: {total_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}") - - early_stopping(val_loss, model) - if early_stopping.early_stop: - print("Early stopping") - break - - torch.save(model.state_dict(), final_ckpt_path) - print(f"Final model saved to {final_ckpt_path}") - - history_df = pd.DataFrame(history) - history_df.to_csv("training_history.csv", index=False) - print("Training history saved to training_history.csv") - -def val(val_loader, model, loss_func, tokenizer, device): - model.eval() - val_loss = 0.0 - with torch.no_grad(): - for batch_idx, (ids, texts, labels) in enumerate(val_loader): - labels = labels.to(device) - - texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) - inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) - with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): - outputs = model(inputs) - loss = loss_func(outputs, labels) - - val_loss += loss.item() - return val_loss / len(val_loader) - -if __name__ == "__main__": - backbone_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B" - deal_folder = "deal" - not_deal_folder = "not_deal" - batch_size = 8 - device = "cuda" - - - train(backbone_dir, deal_folder, not_deal_folder, batch_size, device=device) \ No newline at end of file +from data_process import build_dataloader +from model import TransClassifier, FocalLoss + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import AutoTokenizer + +import pandas as pd +import numpy as np +import os +import json +from datetime import datetime +import gc +from tqdm import tqdm +import warnings +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score +warnings.filterwarnings("ignore") + +class EarlyStopping: + def __init__(self, patience=5, delta=0): + self.patience = patience + self.counter = 0 + self.best_score = None + self.early_stop = False + self.val_loss_min = np.inf + self.delta = delta + + def __call__(self, val_loss, model, best_ckpt_path): + score = -val_loss + + if self.best_score is None: + self.best_score = score + self.save_checkpoint(val_loss, model, best_ckpt_path) + + elif score < self.best_score + self.delta: + self.counter += 1 + print(f'EarlyStopping counter: {self.counter} out of {self.patience}') + if self.counter >= self.patience: + self.early_stop = True + else: + self.best_score = score + self.save_checkpoint(val_loss, model, best_ckpt_path) + self.counter = 0 + + def save_checkpoint(self, val_loss, model, best_ckpt_path): + print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model...') + torch.save(model.state_dict(), best_ckpt_path) + self.val_loss_min = val_loss + +def train(backbone_dir, deal_folder, not_deal_folder, + batch_size, initial_lr=1e-5, max_epochs=100, threshold: int = 10, + device="cuda", use_focal_loss=False, balance=True): + best_ckpt_path = f"best_ckpt_threshold_{threshold}.pth" + + tokenizer = AutoTokenizer.from_pretrained(backbone_dir) + if use_focal_loss: + model = TransClassifier(backbone_dir, output_classes=1, device=device) + if balance: + loss_func = FocalLoss( + gamma=2.0, + alpha=0.5, + reduction='mean', + task_type='binary') + else: + loss_func = FocalLoss( + gamma=2.0, + alpha=0.8, + reduction='mean', + task_type='binary') + else: + model = TransClassifier(backbone_dir, output_classes=2, device=device) + loss_func = nn.CrossEntropyLoss() + assert balance == True, "When not using CE loss, balance must be True." + model.to(device) + + data_dict = build_dataloader(deal_data_folder=deal_folder, not_deal_data_folder=not_deal_folder, batch_size=batch_size, threshold=threshold, balance=balance) + train_loader = data_dict["train"] + val_loader = data_dict["val"] + test_loader = data_dict["test"] + + optimizer = torch.optim.AdamW(model.parameters(), lr=initial_lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10) + + early_stopping = EarlyStopping(patience=10, delta=0) + history = {"train_loss": [], "val_loss": [], "epoch": []} + + for epoch in range(max_epochs): + model.train() + total_loss = 0.0 + train_steps = 0 + + if epoch == 2: + for param in model.backbone.parameters(): + param.requires_grad = True + print("Unfreeze backbone parameters") + + pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{max_epochs} [Train]') + for batch_idx, (ids, texts, labels) in enumerate(pbar): + labels = labels.to(device) + + texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) + inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) + + with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): + outputs = model(inputs) + if use_focal_loss: + outputs = outputs.squeeze(1) + loss = loss_func(outputs, labels.float()) + else: + loss = loss_func(outputs, labels) + + optimizer.zero_grad() + loss.backward() + nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) + optimizer.step() + scheduler.step() + + total_loss += loss.item() + train_steps += 1 + + train_loss = total_loss / train_steps + pbar.set_postfix({"train_loss": train_loss}) + + del texts, labels, outputs, loss + torch.cuda.empty_cache() + gc.collect() + + val_loss = val(val_loader, model, loss_func, tokenizer, device, use_focal_loss) + train_loss_epoch = total_loss / len(train_loader) + history["train_loss"].append(train_loss_epoch) + history["val_loss"].append(val_loss) + history["epoch"].append(epoch+1) + + print(f"Epoch {epoch+1}/{max_epochs}, Train Loss: {total_loss / len(train_loader):.4f}, Val Loss: {val_loss:.4f}") + + early_stopping(val_loss, model, best_ckpt_path) + if early_stopping.early_stop: + print("Early stopping") + break + + history_df = pd.DataFrame(history) + history_df.to_csv(f"training_history_threshold_{threshold}.csv", index=False) + print(f"Training history saved to training_history_threshold_{threshold}.csv") + return test_loader + +def val(val_loader, model, loss_func, tokenizer, device, use_focal_loss=False): + model.eval() + val_loss = 0.0 + with torch.no_grad(): + for batch_idx, (ids, texts, labels) in enumerate(val_loader): + labels = labels.to(device) + + texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) + inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) + with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): + outputs = model(inputs) + if use_focal_loss: + outputs = outputs.squeeze(1) + loss = loss_func(outputs, labels.float()) + else: + loss = loss_func(outputs, labels) + + val_loss += loss.item() + + del inputs, outputs, labels, loss + gc.collect() + torch.cuda.empty_cache() + return val_loss / len(val_loader) + +def test(backbone_dir, test_loader, device, threshold, use_focal_loss=False, balance=True): + tokenizer = AutoTokenizer.from_pretrained(backbone_dir) + + if use_focal_loss: + model = TransClassifier(backbone_dir, output_classes=1, device=device) + else: + model = TransClassifier(backbone_dir, output_classes=2, device=device) + model.to(device) + + ckpt_path = f"best_ckpt_threshold_{threshold}.pth" + if os.path.exists(ckpt_path): + model.load_state_dict(torch.load(ckpt_path, map_location=device)) + print(f"Model loaded from {ckpt_path}") + else: + print(f"Warning: {ckpt_path} not found. Using untrained model.") + + model.eval() + + all_ids = [] + all_preds = [] + all_probs = [] + all_labels = [] + + pbar = tqdm(test_loader, desc="Testing") + with torch.inference_mode(): + for batch_idx, (ids, texts, labels) in enumerate(pbar): + all_ids.extend(ids) + labels = labels.to(device) + + texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) + inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) + + with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): + outputs = model(inputs) + if use_focal_loss: + outputs = outputs.squeeze(-1) # [B, 1] -> [B] + + if use_focal_loss: + outputs_float = outputs.float() # 转换为 float32 避免精度问题 + probs = torch.sigmoid(outputs_float).cpu().numpy().tolist() # [B] + preds = [1 if p >= 0.5 else 0 for p in probs] + else: + preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + + outputs_float = outputs.float() # 转换为 float32 避免精度问题 + probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] + probs = probs.cpu().numpy().tolist() + probs = [p[1] for p in probs] + + all_preds.extend(preds) + all_probs.extend(probs) + all_labels.extend(labels.cpu().numpy()) + + # 清理内存 + del texts, labels, outputs + torch.cuda.empty_cache() + gc.collect() + + # 计算评估指标 + accuracy = accuracy_score(all_labels, all_preds) + precision = precision_score(all_labels, all_preds, average="weighted") + recall = recall_score(all_labels, all_preds, average="weighted") + f1 = f1_score(all_labels, all_preds, average="weighted") + auc = roc_auc_score(all_labels, all_probs) + cm = confusion_matrix(all_labels, all_preds) + + # 打印评估结果 + print("\n=== Test Results ===") + print(f"Accuracy: {accuracy:.4f}") + print(f"Precision: {precision:.4f}") + print(f"Recall: {recall:.4f}") + print(f"F1 Score: {f1:.4f}") + print(f"AUC: {auc:.4f}") + + cm_df = pd.DataFrame(cm, + index=['Actual Not Deal (0)', 'Actual Deal (1)'], + columns=['Predicted Not Deal (0)', 'Predicted Deal (1)']) + print("\nConfusion Matrix:") + print(cm_df) + + precision_per_class = precision_score(all_labels, all_preds, average=None) + recall_per_class = recall_score(all_labels, all_preds, average=None) + f1_per_class = f1_score(all_labels, all_preds, average=None) + print("\n=== Class-wise Metrics ===") + print("Class 0 (Not Deal):") + print(f" Precision: {precision_per_class[0]:.4f}") + print(f" Recall: {recall_per_class[0]:.4f}") + print(f" F1 Score: {f1_per_class[0]:.4f}") + print("\nClass 1 (Deal):") + print(f" Precision: {precision_per_class[1]:.4f}") + print(f" Recall: {recall_per_class[1]:.4f}") + print(f" F1 Score: {f1_per_class[1]:.4f}") + + test_results = { + "accuracy": accuracy, + "precision_weighted": precision, + "recall_weighted": recall, + "f1_weighted": f1, + "auc": auc, + "confusion_matrix": cm.tolist(), + "class_0_precision": precision_per_class[0], + "class_0_recall": recall_per_class[0], + "class_0_f1": f1_per_class[0], + "class_1_precision": precision_per_class[1], + "class_1_recall": recall_per_class[1], + "class_1_f1": f1_per_class[1], + "test_samples": len(all_labels) + } + + with open(f"test_results_threshold_{threshold}.json", "w", encoding="utf-8") as f: + json.dump(test_results, f, ensure_ascii=False, indent=4) + print(f"\nTest results saved to test_results_threshold_{threshold}.json") + + pred_df = pd.DataFrame({ + "ids": all_ids, + "predictions": all_preds, + "probability": all_probs, + "true_labels": all_labels + }) + pred_df.to_csv(f"test_predictions_threshold_{threshold}.csv", index=False, encoding="utf-8") + +if __name__ == "__main__": + backbone_dir = "Qwen3-1.7B" + deal_folder = "filtered_deal" + not_deal_folder = "filtered_not_deal" + batch_size = 4 + device = "cuda" + + """ threshold = 10 + test_loader = train(backbone_dir=backbone_dir, deal_folder=deal_folder, not_deal_folder=not_deal_folder, batch_size=batch_size, threshold=threshold, device=device, use_focal_loss=False, balance=True) + + test( + backbone_dir=backbone_dir, + test_loader=test_loader, + device=device, + threshold=threshold, + use_focal_loss=False, + balance=True + ) """ + + max_threshold = 10 + for i in range(3, 9): + print(f"Training with threshold {i}...") + test_loader = train(backbone_dir=backbone_dir, deal_folder=deal_folder, not_deal_folder=not_deal_folder, batch_size=batch_size, threshold=i, device=device, use_focal_loss=False, balance=True) + + test( + backbone_dir=backbone_dir, + test_loader=test_loader, + device=device, + threshold=i, + use_focal_loss=False, + balance=True + ) \ No newline at end of file From e7202bcdd78fb95bcc2e625670eba59bd33c071b Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:40:34 +0800 Subject: [PATCH 08/12] Delete test.py --- test.py | 154 -------------------------------------------------------- 1 file changed, 154 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index 8e44e3f..0000000 --- a/test.py +++ /dev/null @@ -1,154 +0,0 @@ -from data_process import build_dataloader -from model import TransClassifier - -import torch -import torch.nn as nn -import torch.nn.functional as F -from transformers import AutoTokenizer - -import pandas as pd -import numpy as np -import os -import json -from datetime import datetime -import gc -from tqdm import tqdm -import warnings -from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix -warnings.filterwarnings("ignore") - -def test(backbone_dir, deal_folder, not_deal_folder, batch_size, ckpt_path="best_ckpt.pth", device="cuda"): - """ - 测试模型在测试集上的表现 - - Args: - backbone_dir: 预训练模型目录 - deal_folder: 成交数据文件夹 - not_deal_folder: 非成交数据文件夹 - batch_size: 批量大小 - ckpt_path: 模型 checkpoint 路径 - device: 运行设备 - """ - # 加载测试数据 - data_dict = build_dataloader(deal_folder, not_deal_folder, batch_size) - test_loader = data_dict["test"] - print(f"Test data loaded successfully. Test samples: {len(test_loader.dataset)}") - - # 加载 tokenizer 和模型 - tokenizer = AutoTokenizer.from_pretrained(backbone_dir) - model = TransClassifier(backbone_dir, device) - model.to(device) - - # 加载训练好的模型权重 - if os.path.exists(ckpt_path): - model.load_state_dict(torch.load(ckpt_path, map_location=device)) - print(f"Model loaded from {ckpt_path}") - else: - print(f"Warning: {ckpt_path} not found. Using untrained model.") - - # 测试模型 - model.eval() - all_ids = [] - all_preds = [] - all_labels = [] - test_loss = 0.0 - - loss_func = nn.CrossEntropyLoss() - - pbar = tqdm(test_loader, desc="Testing") - with torch.inference_mode(): - for batch_idx, (ids, texts, labels) in enumerate(pbar): - all_ids.extend(ids) - labels = labels.to(device) - - # 处理输入数据 - texts = tokenizer.apply_chat_template(texts, tokenize=False, add_generation_prompt=True, enable_thinking=False) - inputs = tokenizer(texts, padding=True, truncation=True, max_length=2048, return_tensors="pt").to(device) - - with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16): - outputs = model(inputs) - loss = loss_func(outputs, labels) - - test_loss += loss.item() - - # 计算预测结果 - preds = torch.argmax(outputs, dim=1).cpu().numpy() - all_preds.extend(preds) - all_labels.extend(labels.cpu().numpy()) - - # 清理内存 - del texts, labels, outputs, loss - torch.cuda.empty_cache() - gc.collect() - - # 计算评估指标 - avg_loss = test_loss / len(test_loader) - accuracy = accuracy_score(all_labels, all_preds) - precision = precision_score(all_labels, all_preds, average="weighted") - recall = recall_score(all_labels, all_preds, average="weighted") - f1 = f1_score(all_labels, all_preds, average="weighted") - cm = confusion_matrix(all_labels, all_preds) - - # 打印评估结果 - print("\n=== Test Results ===") - print(f"Average Loss: {avg_loss:.4f}") - print(f"Accuracy: {accuracy:.4f}") - print(f"Precision: {precision:.4f}") - print(f"Recall: {recall:.4f}") - print(f"F1 Score: {f1:.4f}") - print("\nConfusion Matrix:") - print(cm) - print("\n=== Class-wise Metrics ===") - print("Class 0 (Not Deal):") - print(f" Precision: {precision_score(all_labels, all_preds, average=None)[0]:.4f}") - print(f" Recall: {recall_score(all_labels, all_preds, average=None)[0]:.4f}") - print(f" F1 Score: {f1_score(all_labels, all_preds, average=None)[0]:.4f}") - print("\nClass 1 (Deal):") - print(f" Precision: {precision_score(all_labels, all_preds, average=None)[1]:.4f}") - print(f" Recall: {recall_score(all_labels, all_preds, average=None)[1]:.4f}") - print(f" F1 Score: {f1_score(all_labels, all_preds, average=None)[1]:.4f}") - - # 保存测试结果 - test_results = { - "average_loss": avg_loss, - "accuracy": accuracy, - "precision": precision, - "recall": recall, - "f1_score": f1, - "confusion_matrix": cm.tolist(), - "class_0_precision": precision_score(all_labels, all_preds, average=None)[0], - "class_0_recall": recall_score(all_labels, all_preds, average=None)[0], - "class_0_f1": f1_score(all_labels, all_preds, average=None)[0], - "class_1_precision": precision_score(all_labels, all_preds, average=None)[1], - "class_1_recall": recall_score(all_labels, all_preds, average=None)[1], - "class_1_f1": f1_score(all_labels, all_preds, average=None)[1], - "test_samples": len(all_labels), - "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - - # 保存预测结果 - pred_results = { - "ids": all_ids, - "predictions": all_preds, - "true_labels": all_labels - } - pred_df = pd.DataFrame(pred_results) - pred_df.to_csv("test_predictions.csv", index=False, encoding="utf-8") - - # 保存为 JSON 文件 - with open("test_results.json", "w", encoding="utf-8") as f: - json.dump(test_results, f, ensure_ascii=False, indent=2) - print("\nTest results saved to test_results.json") - return test_results - -if __name__ == "__main__": - # 配置参数 - backbone_dir = r"C:\Users\GA\Desktop\models\Qwen3-1.7B" - deal_folder = "deal" - not_deal_folder = "not_deal" - batch_size = 8 - ckpt_path = "best_ckpt.pth" - device = "cuda" - - # 运行测试 - test(backbone_dir, deal_folder, not_deal_folder, batch_size, ckpt_path, device) \ No newline at end of file From 98eb31bf69b12a28e6ea0a6a36e0c1fec39da962 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 11:41:49 +0800 Subject: [PATCH 09/12] Add visualize_training.py --- visualize_training.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 visualize_training.py diff --git a/visualize_training.py b/visualize_training.py new file mode 100644 index 0000000..e4c8622 --- /dev/null +++ b/visualize_training.py @@ -0,0 +1,29 @@ +import pandas as pd +import matplotlib.pyplot as plt + +def visualize_training_history(threshold): + csv_path = f'training_history_threshold_{threshold}.csv' + + df = pd.read_csv(csv_path) + epochs = df['epoch'] + train_loss = df['train_loss'] + val_loss = df['val_loss'] + + plt.figure(figsize=(10, 6)) + plt.plot(epochs, train_loss, 'b-', label='Training Loss') + plt.plot(epochs, val_loss, 'r-', label='Validation Loss') + + plt.title('Training and Validation Loss Over Epochs (CE)') + plt.xlabel('Epochs') + plt.ylabel('Loss') + plt.legend() + plt.grid(True) + + plt.savefig(f'training_visualization_threshold_{threshold}.png') + plt.show() + + print(f'可视化完成,图表已保存为 training_visualization_threshold_{threshold}.png') + +if __name__ == "__main__": + for i in range(11): + visualize_training_history(i) \ No newline at end of file From 7d57a8446be8178c4984b091b3e733a0f0a5f145 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 12:16:22 +0800 Subject: [PATCH 10/12] Update inference.py --- inference.py | 111 ++++++++++++++++++++++----------------------------- 1 file changed, 47 insertions(+), 64 deletions(-) diff --git a/inference.py b/inference.py index d13e641..8959c43 100644 --- a/inference.py +++ b/inference.py @@ -1,11 +1,10 @@ from model import TransClassifier from transformers import AutoTokenizer -from data_process import extract_json_data, Formatter +from data_process import extract_json_data, Formatter, load_data_from_dict import torch import json from typing import Dict, List, Optional import os -import random import warnings warnings.filterwarnings("ignore") @@ -14,14 +13,14 @@ valid_keys = [ "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity", "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage", "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus", - "Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential" + "Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential" ] ch_valid_keys = [ "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数", "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度", "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏", "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成", - "竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力" + "竞争者心态", "认知阶段", "最后互动时间", "推荐潜力" ] all_keys = valid_keys + ["session_id", "label"] en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)} @@ -29,7 +28,7 @@ d1_keys = valid_keys[:5] d2_keys = valid_keys[5:10] d3_keys = valid_keys[10:15] d4_keys = valid_keys[15:19] -d5_keys = valid_keys[19:23] +d5_keys = valid_keys[19:22] class InferenceEngine: def __init__(self, backbone_dir: str, ckpt_path: str = "best_ckpt.pth", device: str = "cuda"): @@ -42,7 +41,7 @@ class InferenceEngine: print(f"Tokenizer loaded from {backbone_dir}") # 加载模型 - self.model = TransClassifier(backbone_dir, device) + self.model = TransClassifier(backbone_dir, 2, device) self.model.to(device) if self.ckpt_path: self.model.load_state_dict(torch.load(ckpt_path, map_location=device)) @@ -57,25 +56,17 @@ class InferenceEngine: def inference_batch(self, json_list: List[str]) -> dict: """ 批量推理函数,输入为 JSON 字符串列表,输出为包含转换概率的字典列表。为防止OOM,列表最大长度为8。 - 请注意Json文件中的词条数必须大于等于10. + 请注意Json文件中的词条数必须大于等于5. """ - # print(111111) - assert len(json_list) <= 10, "单次输入json文件数量不可超过8。" - id2feature = extract_json_data(json_list) - print(json.dumps(id2feature ,indent=2 ,ensure_ascii=False)) - # id2feature + assert len(json_list) <= 8, "单次输入json文件数量不可超过8。" + id2feature = extract_json_data(json_files=json_list, threshold=5) message_list = [] for id, feature in id2feature.items(): messages = self.formatter.get_llm_prompt(feature) message_list.append(messages) - inputs = self.tokenizer.apply_chat_template( - message_list, - tokenize=False, - add_generation_prompt=True, - enable_thinking=False - ) + inputs = self.tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True, enable_thinking=False) model_inputs = self.tokenizer( inputs, padding=True, @@ -87,21 +78,12 @@ class InferenceEngine: with torch.inference_mode(): with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16): outputs = self.model(model_inputs) - - # 1. 计算分类标签(argmax) - preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() - - # 2. 计算softmax概率(核心修正:转CPU、转numpy、转列表,解决Tensor序列化问题) - outputs_float = outputs.float() # 转换为 float32 避免精度问题 - probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] - # 转换为CPU的numpy数组,再转列表(每个样本对应2个类别的概率) - probs = probs.cpu().numpy().tolist() - probs = [p[1] for p in probs] # 只保留类别1的概率 - - # 3. 计算置信度 - confidence = [abs(p - 0.5) * 2 for p in probs] - # 返回格式:labels是每个样本的分类标签列表,probs是每个样本的类别概率列表,confidence是每个样本的置信度列表 - return {"labels": preds, "probs": probs, "confidence": confidence} + preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + outputs_float = outputs.float() + probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] + probs = probs.cpu().numpy().tolist() + probs = [p[1] for p in probs] + return {"labels": preds, "probs": probs} def inference_sample(self, json_path: str) -> dict: """ @@ -109,24 +91,21 @@ class InferenceEngine: 请注意Json文件中的词条数必须大于等于10. """ return self.inference_batch([json_path]) - - def inference( - self, - featurs : dict[str ,dict] - ): - assert len(featurs) <= 10, "单次输入json文件数量不可超过8。" + + def inference_batch_json_data(self, json_data: List[dict]) -> dict: + """ + 批量推理函数,输入为 JSON 数据,输出为包含转换概率的字典列表。为防止OOM,列表最大长度为8。 + 请注意Json文件中的词条数必须大于等于5. 但此处不进行过滤,请注意稍后对输出进行过滤。 + """ + assert len(json_data) <= 8, "单次输入json数据数量不可超过8。" + pseudo_id2feature = load_data_from_dict(json_data) + message_list = [] - for id, feature in featurs.items(): + for id, feature in pseudo_id2feature.items(): messages = self.formatter.get_llm_prompt(feature) message_list.append(messages) - inputs = self.tokenizer.apply_chat_template( - message_list, - tokenize=False, - add_generation_prompt=True, - enable_thinking=False - ) - + inputs = self.tokenizer.apply_chat_template(message_list, tokenize=False, add_generation_prompt=True, enable_thinking=False) model_inputs = self.tokenizer( inputs, padding=True, @@ -138,26 +117,30 @@ class InferenceEngine: with torch.inference_mode(): with torch.amp.autocast(device_type=self.device, dtype=torch.bfloat16): outputs = self.model(model_inputs) - - # 1. 计算分类标签(argmax) - preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() - - # 2. 计算softmax概率(核心修正:转CPU、转numpy、转列表,解决Tensor序列化问题) - outputs_float = outputs.float() # 转换为 float32 避免精度问题 - probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] - # 转换为CPU的numpy数组,再转列表(每个样本对应2个类别的概率) - probs = probs.cpu().numpy().tolist() - probs = [p[1] for p in probs] # 只保留类别1的概率 - - # 3. 计算置信度 - confidence = [abs(p - 0.5) * 2 for p in probs] - # 返回格式:labels是每个样本的分类标签列表,probs是每个样本的类别概率列表,confidence是每个样本的置信度列表 - return {"labels": preds, "probs": probs, "confidence": confidence} + preds = torch.argmax(outputs, dim=1).cpu().numpy().tolist() + outputs_float = outputs.float() + probs = torch.softmax(outputs_float, dim=1) # probs: [B, 2] + probs = probs.cpu().numpy().tolist() + probs = [p[1] for p in probs] + return {"labels": preds, "probs": probs} if __name__ == "__main__": - # 配置参数 backbone_dir = "Qwen3-1.7B" ckpt_path = "best_ckpt.pth" device = "cuda" - engine = InferenceEngine(backbone_dir, ckpt_path, device) \ No newline at end of file + engine = InferenceEngine(backbone_dir, ckpt_path, device) + import glob + deal_files = glob.glob(os.path.join("filtered_deal", "*.json")) + test_deal_files = deal_files[:4] + not_deal_files = glob.glob(os.path.join("filtered_not_deal", "*.json")) + test_not_deal_files = not_deal_files[:4] + + test_files = test_deal_files + test_not_deal_files + test_dict = [] + for test_file in test_files: + with open(test_file, "r", encoding="utf-8") as f: + json_data = json.load(f) + test_dict.append(json_data) + results = engine.inference_batch_json_data(test_dict) + print(results) \ No newline at end of file From 94d2c2aa9fc3511fa1d40014d445a931b1c1b696 Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 12:16:43 +0800 Subject: [PATCH 11/12] Update data_process/process/content_extract.py --- data_process/process/content_extract.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/data_process/process/content_extract.py b/data_process/process/content_extract.py index cd05e29..b39d019 100644 --- a/data_process/process/content_extract.py +++ b/data_process/process/content_extract.py @@ -1,6 +1,7 @@ import os import glob import json +from typing import List, Dict valid_keys = [ "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index", @@ -77,6 +78,25 @@ def extract_json_data(json_files: list, threshold: int = 10) -> dict: raise ValueError(f"Invalid key {key} in {json_file}") return filt_json_data(data, threshold) +def load_data_from_dict(data_dict: List[dict]): + """ + 不进行阈值过滤,直接加载数据 + """ + data = {} + for idx, item in enumerate(data_dict): + data[idx] = {} + for key, value in item.items(): + if key in valid_keys: + data[idx][key] = value.get("value", None) + elif key == "Follow_up_Priority": + continue + else: + match_key = try_match_error_key(key) + if match_key: + data[idx][match_key] = value.get("value", None) + else: + print(f"Warning: Invalid key {key} in data dict, skipped.") + return data if __name__=="__main__": deal_folder = "deal" From 2c72c000cd4d81febd83e3049c82a9d2174fd41c Mon Sep 17 00:00:00 2001 From: WangZiFan Date: Fri, 27 Feb 2026 12:17:23 +0800 Subject: [PATCH 12/12] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 96da4c4..6568fd6 100644 --- a/README.md +++ b/README.md @@ -9,10 +9,9 @@ --model/ --__init__.py --modelling.py + --focal_loss.py --inference.py # 推理接口 --train.py ---test.py ---statis_main.py --Qwen3-1.7B/ --best_ckpt.pth ```