"""Collect per-session field values from JSON files in a folder.

Each ``*.json`` file is expected to hold one object mapping a field name to an
object that has a ``"value"`` entry.  The file's base name (text before the
first ``.``) is used as the session id.  Field names that are not spelled
exactly like one of ``valid_keys`` are matched by their first ``_``-separated
segment; names that still cannot be matched abort the run with ``ValueError``.
"""
import glob
import json
import os

# Canonical (English) field names accepted in the JSON files.
valid_keys = [
    "Core_Fear_Source", "Pain_Threshold", "Time_Window_Pressure", "Helplessness_Index",
    "Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
    "Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
    "Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
    "Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential",
]
# Chinese labels, paired with ``valid_keys`` by position.
ch_valid_keys = [
    "核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
    "社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
    "沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
    "低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
    "竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力",
]
all_keys = valid_keys + ["session_id", "label"]
en2ch = dict(zip(valid_keys, ch_valid_keys))

# Consecutive slices of ``valid_keys``.  What each group stands for is not
# stated in this file.
d1_keys = valid_keys[:5]
d2_keys = valid_keys[5:10]
d3_keys = valid_keys[10:15]
d4_keys = valid_keys[15:19]
d5_keys = valid_keys[19:23]

# Lookup helpers derived once from ``valid_keys`` instead of on every call.
_VALID_KEY_SET = frozenset(valid_keys)
# First "_"-separated segment of each valid key -> the full key.  If two keys
# ever shared a first segment, the later one in ``valid_keys`` would win.
_FIRST_PART_TO_KEY = {key.split("_")[0]: key for key in valid_keys}


def extract_json_files(folder: str):
    """Return the ``*.json`` paths directly inside *folder*, sorted by path.

    ``glob`` yields entries in arbitrary order, so the result is sorted to
    keep processing order and log output reproducible between runs.
    """
    json_files = sorted(glob.glob(os.path.join(folder, "*.json")))
    print(f"Total {len(json_files)} json files in {folder}")
    return json_files


def extract_txt_files(folder: str):
    """Return the ``*.txt`` paths directly inside *folder*, sorted by path."""
    txt_files = sorted(glob.glob(os.path.join(folder, "*.txt")))
    print(f"Total {len(txt_files)} txt files in {folder}")
    return txt_files


def try_match_error_key(error_key: str):
    """Map a misspelled field name to a valid key by its first segment.

    Only the text before the first ``_`` is compared, so ``"Core_Fear"``
    resolves to ``"Core_Fear_Source"``.

    Returns:
        The matching entry of ``valid_keys``, or ``None`` when no valid key
        starts with the same segment.
    """
    first_part = error_key.split("_")[0]
    match_key = _FIRST_PART_TO_KEY.get(first_part)
    if match_key is None:
        return None
    print(f"Match {error_key} to {match_key}")
    return match_key


def filt_json_data(json_data: dict, min_keys: int = 10):
    """Keep only the sessions that carry at least *min_keys* fields.

    Args:
        json_data: Mapping of session id to that session's field mapping.
        min_keys: Minimum number of fields a session needs to be kept.

    Returns:
        A new dict with the surviving sessions; *json_data* is not modified.
    """
    new_json_data = {k: v for k, v in json_data.items() if len(v) >= min_keys}
    print(f"Total {len(new_json_data)} json keys after filter")
    return new_json_data


def extract_json_data(json_files: list) -> dict:
    """Load every file in *json_files* and gather its field values per session.

    Args:
        json_files: Paths of JSON files to read (UTF-8).

    Returns:
        Mapping of session id to ``{valid_key: value}``, already passed
        through ``filt_json_data`` with its default threshold.

    Raises:
        ValueError: A field name is neither a valid key nor matchable to one.
    """
    data = {}
    for json_file in json_files:
        # Session id is the base name up to the first dot.
        session_id = os.path.basename(json_file).split(".")[0]
        with open(json_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        session = {}
        for key, value in json_data.items():
            if key in _VALID_KEY_SET:
                session[key] = value['value']
                continue

            match_key = try_match_error_key(key)
            if match_key is None:
                raise ValueError(f"Invalid key {key} in {json_file}")
            if match_key in json_data:
                # The correctly spelled key is present in the same file; do
                # not let the guessed match overwrite its value.
                continue
            session[match_key] = value['value']
        data[session_id] = session
    return filt_json_data(data)


def main():
    """Scan the ``deal`` and ``not_deal`` folders relative to the working directory."""
    deal_folder = "deal"
    not_deal_folder = "not_deal"

    deal_json_files = extract_json_files(deal_folder)
    deal_data = extract_json_data(deal_json_files)
    deal_txt_files = extract_txt_files(deal_folder)

    not_deal_json_files = extract_json_files(not_deal_folder)
    not_deal_data = extract_json_data(not_deal_json_files)
    not_deal_txt_files = extract_txt_files(not_deal_folder)


if __name__ == "__main__":
    main()