import os
import glob
import json
from typing import List, Dict, Optional

# Canonical (English) feature keys that a record may contain.
valid_keys = [
    "Core_Fear_Source",
    "Pain_Threshold",
    "Time_Window_Pressure",
    "Helplessness_Index",
    "Social_Shame",
    "Payer_Decision_Maker",
    "Hidden_Wealth_Proof",
    "Price_Sensitivity",
    "Sunk_Cost",
    "Compensatory_Spending",
    "Trust_Deficit",
    "Secret_Resistance",
    "Family_Sabotage",
    "Low_Self_Efficacy",
    "Attribution_Barrier",
    "Emotional_Trigger",
    "Ultimatum_Event",
    "Expectation_Bonus",
    "Competitor_Mindset",
    "Cognitive_Stage",
    "Last_Interaction",
    "Referral_Potential",
]

# Chinese labels, paired positionally with ``valid_keys`` (see ``en2ch``).
ch_valid_keys = [
    "核心恐惧源",
    "疼痛阈值",
    "时间窗口压力",
    "无助指数",
    "社会羞耻感",
    "付款决策者",
    "隐藏财富证明",
    "价格敏感度",
    "沉没成本",
    "补偿性消费",
    "信任赤字",
    "秘密抵触情绪",
    "家庭破坏",
    "低自我效能感",
    "归因障碍",
    "情绪触发点",
    "最后通牒事件",
    "期望加成",
    "竞争者心态",
    "认知阶段",
    "最后互动时间",
    "推荐潜力",
]

all_keys = valid_keys + ["session_id", "label"]

# English key -> Chinese label.
en2ch = dict(zip(valid_keys, ch_valid_keys))

# Five consecutive slices of ``valid_keys``.
d1_keys = valid_keys[:5]
d2_keys = valid_keys[5:10]
d3_keys = valid_keys[10:15]
d4_keys = valid_keys[15:19]
d5_keys = valid_keys[19:22]

# Key that may appear in a record but is never loaded.
_IGNORED_KEY = "Follow_up_Priority"

# First "_"-separated token of every valid key -> the full key.  Built once
# here instead of on every ``try_match_error_key`` call.  If two keys ever
# shared a first token, the later one in ``valid_keys`` would win.
_PREFIX_TO_KEY = {key.split("_")[0]: key for key in valid_keys}


def _list_files(folder: str, extension: str) -> List[str]:
    """Return the sorted ``*.<extension>`` paths directly inside *folder*."""
    # glob yields paths in arbitrary order; sorting makes the load order (and
    # therefore which file wins a session-id collision) reproducible.
    files = sorted(glob.glob(os.path.join(folder, f"*.{extension}")))
    print(f"Total {len(files)} {extension} files in {folder}")
    return files


def extract_json_files(folder: str) -> List[str]:
    """Return the sorted paths of the ``*.json`` files in *folder*.

    Prints the number of files found.
    """
    return _list_files(folder, "json")


def extract_txt_files(folder: str) -> List[str]:
    """Return the sorted paths of the ``*.txt`` files in *folder*.

    Prints the number of files found.
    """
    return _list_files(folder, "txt")


def try_match_error_key(error_key: str) -> Optional[str]:
    """Map a misspelled key onto a valid key by its first ``_``-separated token.

    Args:
        error_key: A key that is not in ``valid_keys``.

    Returns:
        The valid key whose first token equals the first token of
        *error_key*, or ``None`` when there is no such key or *error_key*
        is not a string.  A successful match is reported on stdout.
    """
    if not isinstance(error_key, str):
        return None
    match_key = _PREFIX_TO_KEY.get(error_key.split("_")[0])
    if match_key is None:
        return None
    print(f"Match {error_key} to {match_key}")
    return match_key


def filt_json_data(json_data: dict, threshold: int = 10) -> dict:
    """Keep only the records that have at least *threshold* fields.

    Args:
        json_data: Mapping of record id -> record (a sized container).
        threshold: Minimum number of fields a record needs to be kept.

    Returns:
        A new dict holding the records that pass.  Empty records are always
        dropped, even when *threshold* is zero or negative.
    """
    min_fields = max(threshold, 1)
    new_json_data = {k: v for k, v in json_data.items() if len(v) >= min_fields}
    print(f"Total {len(new_json_data)} json data after filter with threshold {threshold}")
    return new_json_data


def _normalize_record(record: dict, source: str, strict: bool) -> dict:
    """Reduce one raw record to ``{valid_key: entry["value"]}``.

    Keys are resolved exactly first, then through ``try_match_error_key``;
    ``Follow_up_Priority`` is ignored.  A key that cannot be resolved, or an
    entry that is not a dict, raises ``ValueError`` when *strict* is true and
    is reported on stdout and skipped otherwise.  *source* only labels the
    messages.
    """
    normalized = {}
    for key, entry in record.items():
        if key == _IGNORED_KEY:
            continue
        target_key = key if key in valid_keys else try_match_error_key(key)
        if target_key is None:
            problem = f"Invalid key {key} in {source}"
        elif not isinstance(entry, dict):
            # Previously this surfaced as an AttributeError on ``.get``.
            problem = (
                f"Invalid entry for key {key} in {source} "
                f"(expected a dict, got {type(entry).__name__})"
            )
        else:
            normalized[target_key] = entry.get("value", None)
            continue
        if strict:
            raise ValueError(problem)
        print(f"Warning: {problem}, skipped.")
    return normalized


def extract_json_data(json_files: list, threshold: int = 10) -> dict:
    """Load per-session JSON files and keep the sufficiently complete ones.

    The session id is the file's base name up to its first ``.``, so files
    such as ``a.1.json`` and ``a.2.json`` share the id ``a`` and the one
    loaded later replaces the earlier one.

    Args:
        json_files: Paths of the JSON files to load.
        threshold: Passed to ``filt_json_data``.

    Returns:
        ``{session_id: {valid_key: value}}`` after threshold filtering.

    Raises:
        ValueError: If a file's top-level JSON value is not an object, a key
            cannot be mapped to a valid key, or an entry is not an object.
    """
    data = {}
    for json_file in json_files:
        session_id = os.path.basename(json_file).split(".")[0]
        with open(json_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)
        if not isinstance(json_data, dict):
            raise ValueError(f"Invalid JSON root in {json_file} (expected an object)")
        data[session_id] = _normalize_record(json_data, json_file, strict=True)
    return filt_json_data(data, threshold)


def load_data_from_dict(data_dict: List[dict]) -> dict:
    """Load records directly from in-memory dicts, without threshold filtering.

    Unlike ``extract_json_data`` this loader is lenient: invalid keys and
    malformed entries are reported on stdout and skipped.

    Args:
        data_dict: The raw records.

    Returns:
        ``{index: {valid_key: value}}`` with one entry per input item, keyed
        by the item's position in *data_dict*.
    """
    data = {}
    for idx, item in enumerate(data_dict):
        if not isinstance(item, dict):
            print(f"Warning: Invalid item at index {idx} in data dict, skipped.")
            # Keep an empty record so indices stay aligned with the input.
            data[idx] = {}
            continue
        data[idx] = _normalize_record(item, "data dict", strict=False)
    return data


if __name__ == "__main__":
    # Input folder names; presumably one per outcome class -- confirm.
    deal_folder = "deal"
    not_deal_folder = "not_deal"