Update data_process/process/content_extract.py
This commit is contained in:
@@ -7,14 +7,14 @@ valid_keys = [
|
|||||||
"Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
|
"Social_Shame", "Payer_Decision_Maker", "Hidden_Wealth_Proof", "Price_Sensitivity",
|
||||||
"Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
|
"Sunk_Cost", "Compensatory_Spending", "Trust_Deficit", "Secret_Resistance", "Family_Sabotage",
|
||||||
"Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
|
"Low_Self_Efficacy", "Attribution_Barrier", "Emotional_Trigger", "Ultimatum_Event", "Expectation_Bonus",
|
||||||
"Competitor_Mindset", "Cognitive_Stage", "Follow_up_Priority", "Last_Interaction", "Referral_Potential"
|
"Competitor_Mindset", "Cognitive_Stage", "Last_Interaction", "Referral_Potential"
|
||||||
]
|
]
|
||||||
ch_valid_keys = [
|
ch_valid_keys = [
|
||||||
"核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
|
"核心恐惧源", "疼痛阈值", "时间窗口压力", "无助指数",
|
||||||
"社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
|
"社会羞耻感", "付款决策者", "隐藏财富证明", "价格敏感度",
|
||||||
"沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
|
"沉没成本", "补偿性消费", "信任赤字", "秘密抵触情绪", "家庭破坏",
|
||||||
"低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
|
"低自我效能感", "归因障碍", "情绪触发点", "最后通牒事件", "期望加成",
|
||||||
"竞争者心态", "认知阶段", "跟进优先级", "最后互动时间", "推荐潜力"
|
"竞争者心态", "认知阶段", "最后互动时间", "推荐潜力"
|
||||||
]
|
]
|
||||||
all_keys = valid_keys + ["session_id", "label"]
|
all_keys = valid_keys + ["session_id", "label"]
|
||||||
en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)}
|
en2ch = {en:ch for en, ch in zip(valid_keys, ch_valid_keys)}
|
||||||
@@ -22,7 +22,7 @@ d1_keys = valid_keys[:5]
|
|||||||
d2_keys = valid_keys[5:10]
|
d2_keys = valid_keys[5:10]
|
||||||
d3_keys = valid_keys[10:15]
|
d3_keys = valid_keys[10:15]
|
||||||
d4_keys = valid_keys[15:19]
|
d4_keys = valid_keys[15:19]
|
||||||
d5_keys = valid_keys[19:23]
|
d5_keys = valid_keys[19:22]
|
||||||
|
|
||||||
def extract_json_files(folder: str):
|
def extract_json_files(folder: str):
|
||||||
json_files = glob.glob(os.path.join(folder, "*.json"))
|
json_files = glob.glob(os.path.join(folder, "*.json"))
|
||||||
@@ -47,17 +47,17 @@ def try_match_error_key(error_key: str):
|
|||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def filt_json_data(json_data: dict):
|
def filt_json_data(json_data: dict, threshold: int = 10):
|
||||||
new_json_data = {}
|
new_json_data = {}
|
||||||
|
|
||||||
for k, v in json_data.items():
|
for k, v in json_data.items():
|
||||||
if len(v) >= 10:
|
if len(v) >= threshold and len(v) != 0:
|
||||||
new_json_data[k] = v
|
new_json_data[k] = v
|
||||||
print(f"Total {len(new_json_data)} json keys after filter")
|
print(f"Total {len(new_json_data)} json data after filter with threshold {threshold}")
|
||||||
return new_json_data
|
return new_json_data
|
||||||
|
|
||||||
|
|
||||||
def extract_json_data(json_files: list) -> dict:
|
def extract_json_data(json_files: list, threshold: int = 10) -> dict:
|
||||||
data = {}
|
data = {}
|
||||||
for json_file in json_files:
|
for json_file in json_files:
|
||||||
session_id = os.path.basename(json_file).split(".")[0]
|
session_id = os.path.basename(json_file).split(".")[0]
|
||||||
@@ -66,24 +66,18 @@ def extract_json_data(json_files: list) -> dict:
|
|||||||
json_data = json.load(f)
|
json_data = json.load(f)
|
||||||
for key, value in json_data.items():
|
for key, value in json_data.items():
|
||||||
if key in valid_keys:
|
if key in valid_keys:
|
||||||
data[session_id][key] = value['value']
|
data[session_id][key] = value.get("value", None)
|
||||||
|
elif key == "Follow_up_Priority":
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
match_key = try_match_error_key(key)
|
match_key = try_match_error_key(key)
|
||||||
if match_key:
|
if match_key:
|
||||||
data[session_id][match_key] = value['value']
|
data[session_id][match_key] = value.get("value", None)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid key {key} in {json_file}")
|
raise ValueError(f"Invalid key {key} in {json_file}")
|
||||||
return filt_json_data(data)
|
return filt_json_data(data, threshold)
|
||||||
|
|
||||||
|
|
||||||
if __name__=="__main__":
|
if __name__=="__main__":
|
||||||
deal_folder = "deal"
|
deal_folder = "deal"
|
||||||
not_deal_folder = "not_deal"
|
not_deal_folder = "not_deal"
|
||||||
|
|
||||||
deal_json_files = extract_json_files(deal_folder)
|
|
||||||
deal_data = extract_json_data(deal_json_files)
|
|
||||||
deal_txt_files = extract_txt_files(deal_folder)
|
|
||||||
|
|
||||||
not_deal_json_files = extract_json_files(not_deal_folder)
|
|
||||||
not_deal_data = extract_json_data(not_deal_json_files)
|
|
||||||
not_deal_txt_files = extract_txt_files(not_deal_folder)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user