Files
yangcong_22_feature/data/object_convert.py
2026-04-02 10:50:57 +08:00

60 lines
2.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Convert the role codes at the start of each line in TXT files: 0 -> 家长 (parent), 1 -> 销售 (sales).
import os
import re
# Directory that holds the original TXT files (point this at your data folder).
INPUT_DIR = "./data_new/processed_txt_files"
# Directory the converted files are saved to (created automatically; each
# output file keeps the name of its input file).
OUTPUT_DIR = "./data_role_replaced"
# Regex matching a "0:" or "1:" at the start of a line, so that only the role
# tag is replaced and the spoken text itself is never touched.
ROLE_PATTERN = re.compile(r'^([01]):', re.MULTILINE) # re.MULTILINE makes ^ match at the start of every line
def replace_role_in_txt(txt_path, output_path) -> None:
    """
    Convert the role tags of a single TXT file and save the result.

    Every match of the module-level ``ROLE_PATTERN`` (a ``0:`` or ``1:`` at
    the start of a line) is replaced by a role label: ``家长:`` when the
    captured code is ``"0"``, ``销售:`` otherwise. All other text is written
    out unchanged. A progress line with the input file name is printed when
    the file has been written.

    :param txt_path: path of the original TXT file (decoded as UTF-8; a
        leading byte-order mark is tolerated and dropped)
    :param output_path: path the converted TXT file is written to (UTF-8)
    """
    # 1. Read the whole input file.
    #    "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8", but also
    #    strips a leading byte-order mark. With plain "utf-8" the BOM would
    #    stay in the text as U+FEFF in front of the first tag, so "^" could
    #    not match there and the first line would be left unconverted.
    with open(txt_path, 'r', encoding='utf-8-sig') as f:
        content = f.read()

    # 2. Map each captured role code to its label.
    def replace_match(match):
        role_code = match.group(1)  # the captured "0" or "1"
        return "家长:" if role_code == "0" else "销售:"

    # sub() calls replace_match once per matched tag.
    replaced_content = ROLE_PATTERN.sub(replace_match, content)

    # 3. Save the converted text.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(replaced_content)
    print(f"处理完成:{os.path.basename(txt_path)}")
def batch_replace_all_txt() -> None:
    """Convert every TXT file directly inside INPUT_DIR into OUTPUT_DIR.

    OUTPUT_DIR is created if needed. Each regular file in INPUT_DIR whose
    name ends with ``.txt`` is passed to ``replace_role_in_txt`` and written
    to OUTPUT_DIR under the same file name. Files are processed in sorted
    name order. If INPUT_DIR does not exist or contains no such file, a
    message is printed and nothing is converted.
    """
    # 1. Create the output directory (no error if it already exists).
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 2. Collect the TXT files in the input directory.
    #    - A missing INPUT_DIR is treated like an empty one, so the user gets
    #      the "check the path" message below instead of an uncaught
    #      FileNotFoundError from os.listdir.
    #    - isfile() skips directories that merely end in ".txt"; opening one
    #      would abort the whole batch.
    #    - os.listdir returns names in arbitrary order; sorting keeps the
    #      processing order and the log output reproducible.
    if os.path.isdir(INPUT_DIR):
        txt_files = sorted(
            name
            for name in os.listdir(INPUT_DIR)
            if name.endswith('.txt')
            and os.path.isfile(os.path.join(INPUT_DIR, name))
        )
    else:
        txt_files = []

    if not txt_files:
        print(f"未在 {INPUT_DIR} 目录找到TXT文件请检查路径")
        return

    # 3. Convert the files one by one, keeping each file name.
    print(f"共发现 {len(txt_files)} 个TXT文件开始批量替换角色...")
    for txt_filename in txt_files:
        input_path = os.path.join(INPUT_DIR, txt_filename)
        output_path = os.path.join(OUTPUT_DIR, txt_filename)
        replace_role_in_txt(input_path, output_path)

    print(f"\n全部处理完成!文件已保存至:{os.path.abspath(OUTPUT_DIR)}")
if __name__ == "__main__":
    # Run the batch conversion only when this file is executed as a script,
    # not when it is imported as a module.
    batch_replace_all_txt()