Files
onion-dmp/tag_design_analysis.py
2026-04-08 14:52:09 +08:00

295 lines
9.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
基于清洗1.0.xlsx的标签体系设计
"""
import sys
from collections import defaultdict, Counter

import openpyxl
# Location of the source workbook. The default is the original hard-coded
# path; an optional first command-line argument overrides it so the script
# can be run against a copy stored elsewhere.
file_path = sys.argv[1] if len(sys.argv) > 1 else '/Users/inkling/Desktop/dmp/清洗1.0.xlsx'
wb = openpyxl.load_workbook(file_path)
# Every later section reads its column from the workbook's active sheet.
ws = wb.active

# Report banner.
print("\n" + "="*100)
print("标签体系设计方案 v3.0")
print("="*100)
# Walk through each analysis dimension of the worksheet in turn.
print("\n【第一层:监护人信息维度】")
print("-" * 100)

# 1.1 Guardian's family role (column A): tally every distinct cell value.
role1 = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'A{row}'].value
    if not val:
        continue
    role_text = str(val).strip()
    # Test emptiness after stripping so whitespace-only cells are not
    # tallied under the '' key.
    if role_text:
        role1[role_text] += 1

print("\n1.1 监护人主要身份第A列")
print(" 标准化后的分类方案:")
# Standard role -> raw spellings that are folded into it.
# NOTE(review): the '母亲' entry had a third variant that is blank in this
# copy of the file (probably a lost single-character value); it is dropped
# here because a blank variant can never match a counted cell. Restore the
# real value from the original file if there was one.
# Each standard label is also listed as its own variant so a cell that
# already uses the standard wording is counted.
role_mapping = {
    '母亲': ['母亲', '妈妈'],
    '父亲': ['父亲', '爸爸'],
    '祖母': ['奶奶', '祖母'],
    '祖父': ['爷爷', '祖父'],
    '外祖母': ['外婆', '姥姥', '外祖母'],
    '外祖父': ['外公', '姥爷', '外祖父'],
    '其他亲属': ['舅舅', '妻子', '大姐']
}
for std_role, variants in role_mapping.items():
    # Raw values not listed in any variant list are not reported.
    count = sum(role1.get(v, 0) for v in variants)
    print(f"{std_role:15s}: {count:3d} 人 (包含: {', '.join(variants)})")
# 1.2 Guardian's education level (column B).
print("\n1.2 监护人文化程度第B列")
education = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'B{row}'].value
    if not val:
        continue
    edu_text = str(val).strip()
    # Whitespace-only cells are treated as empty rather than counted as ''.
    if edu_text:
        education[edu_text] += 1

# Standard education level -> raw spellings that are folded into it.
edu_mapping = {
    '小学或以下': ['小学', '初小'],
    '初中': ['初中'],
    '中专/中师': ['中专', '中师'],
    '高中': ['高中'],
    '大专': ['大专'],
    '本科': ['本科', '大学', '大学本科'],
    '硕士及以上': ['硕士', '研究生', '在职研究生']
}
for std_edu, variants in edu_mapping.items():
    # Exact-match lookup: raw values outside the variant lists are not reported.
    count = sum(education.get(v, 0) for v in variants)
    print(f"{std_edu:15s}: {count:3d}")
# 1.3 Guardian's occupation (column C).
print("\n1.3 监护人职业第C列")
job = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'C{row}'].value
    if not val:
        continue
    job_text = str(val).strip()
    # Whitespace-only cells are treated as empty rather than counted as ''.
    if job_text:
        job[job_text] += 1

# NOTE(review): the figures below are hard-coded totals, not derived from the
# `job` tally above, so they will go stale if the workbook changes.
job_mapping = {
    '退休': 33,
    '医生/教师/公务员': 22,  # 9 + 8 + 5
    '务农/工人/农民': 20,  # 8 + 6 + 6
    '个体/自由/自营': 15,  # 7 + 4 + 4 + "custom" (meaning of "custom" unclear)
    '其他': 93  # remainder
}
# Print from job_mapping so each number is written in exactly one place; the
# emitted text is unchanged.
print(f" • 退休{job_mapping['退休']}人 (最常见)")
print(f" • 医疗/教育/公务{job_mapping['医生/教师/公务员']}人社会中流")
print(f" • 农业/工业{job_mapping['务农/工人/农民']}人生产者")
print(f" • 自营/个体{job_mapping['个体/自由/自营']}人创业者")
print(f" • 其他手工业/服务业{job_mapping['其他']}人多元职业")
print("\n【第二层:孩子信息维度】")
print("-" * 100)

# 2.1 Child's gender (column F).
print("\n2.1 孩子性别第F列")
gender = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'F{row}'].value
    if not val:
        continue
    gender_text = str(val).strip()
    # Whitespace-only cells are treated as empty rather than counted as ''.
    if gender_text:
        gender[gender_text] += 1

# NOTE(review): both lookups used a blank key in this copy of the file, so
# both lines printed the same number. The keys are restored as '男' / '女' to
# match the printed labels - confirm these are the values used in column F.
print(f" • 男孩:{gender['男']}")
print(f" • 女孩:{gender['女']}")
# 2.2 Child's school grade (column G).
print("\n2.2 孩子年级第G列")
grade = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'G{row}'].value
    if not val:
        continue
    grade_text = str(val).strip()
    # Whitespace-only cells are treated as empty rather than counted as ''.
    if grade_text:
        grade[grade_text] += 1

# School-stage group -> raw grade spellings that belong to it.
# '九年级' (grade 9) is the graduating year of junior high, so it is listed
# with '初三' rather than with the 初一/初二 group.
grade_groups = {
    '小学低段(1-3年级)': ['一年级', '二年级', '三年级', '1年级', '2年级', '3年级'],
    '小学高段(4-6年级)': ['四年级', '五年级', '六年级', '4年级', '5年级', '6年级'],
    '初中前期(初一初二)': ['初一', '初二', '准初二', '开学初二'],
    '初中毕业班(初三)': ['初三', '九年级'],
    '高中前期(高一高二)': ['高一', '高二'],
    '高中毕业班(高三)': ['高三'],
    '学段待确认': ['其他']
}
for group_name, grades in grade_groups.items():
    count = sum(grade.get(g, 0) for g in grades)
    # Groups with no matching rows are left out of the report.
    if count > 0:
        print(f"{group_name:20s}: {count:3d}")
# 2.3 Child's academic performance (column H).
print("\n2.3 孩子学习成绩第H列")
# Levels are tested in this order and the first one found inside the cell
# text wins.
# NOTE(review): the fourth keyword was blank in this copy of the file, which
# made the last branch match every remaining cell ('' is a substring of any
# string). It is restored as '差', the remaining level of the four-level
# scale 优秀 / 良好 / 一般 / 差 - confirm against the original file.
score_levels = ('优秀', '良好', '一般', '差')
score = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'H{row}'].value
    if not val:
        continue
    val_str = str(val).strip()
    # Reduce free text to its core level; cells naming none of the levels
    # are not counted.
    for level_name in score_levels:
        if level_name in val_str:
            score[level_name] += 1
            break

# Most frequent level first; ties keep first-seen order.
for level, count in sorted(score.items(), key=lambda item: item[1], reverse=True):
    print(f"{level:8s}: {count:3d}")
print("\n【第三层:家庭环境维度】")
print("-" * 100)

# 3.1 Family structure (column I).
print("\n3.1 家庭结构第I列")
# Ordered (keywords, category) rules: the first rule with a keyword found in
# the cell text decides the category, so earlier rules take precedence.
family_rules = (
    (('三代同堂',), '三代同堂'),
    (('隔代抚养',), '隔代抚养'),
    (('离异',), '离异'),
    (('单亲',), '单亲'),
    (('三口之家', '四口之家'), '核心家庭'),
)
fam_struct = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'I{row}'].value
    if not val:
        continue
    val_str = str(val).strip()
    # Cells matching no rule are not counted.
    for keywords, category in family_rules:
        if any(keyword in val_str for keyword in keywords):
            fam_struct[category] += 1
            break

# Most frequent structure first; ties keep first-seen order.
for struct, count in sorted(fam_struct.items(), key=lambda item: item[1], reverse=True):
    print(f"{struct:20s}: {count:3d}")
# 3.2 Parent-child relationship quality (column J).
print("\n3.2 亲子关系质量第J列")
# Ordered (keywords, category) rules; the first rule with a keyword found in
# the cell text wins, anything else is counted as '未知'.
# NOTE(review): in this copy of the file one keyword in the '良好' list and one
# in the '较差' list were blank. '' is a substring of every string, so the
# blank keyword made the first rule match every cell and the other categories
# unreachable. The blanks are removed here; restore the real keywords
# (probably single characters) from the original file.
relation_rules = (
    (('良好', '和谐', '可以', '还好', '较好', '还可以'), '良好'),
    (('一般', '还行', '正常', '时好时坏'), '一般'),
    (('不好', '紧张'), '较差'),
)
relation = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'J{row}'].value
    if not val:
        continue
    val_str = str(val).strip()
    for keywords, category in relation_rules:
        if any(keyword in val_str for keyword in keywords):
            relation[category] += 1
            break
    else:
        # No rule matched this cell.
        relation['未知'] += 1

# Most frequent category first; ties keep first-seen order.
for quality, count in sorted(relation.items(), key=lambda item: item[1], reverse=True):
    print(f"{quality:8s}: {count:3d}")
print("\n【第四层:教育风险维度】")
print("-" * 100)

# 4.1 Whether the parents agree on how to educate the child (column K).
print("\n4.1 家长教育理念一致性第K列")
# Ordered (keywords, category) rules; the first rule with a keyword found in
# the cell text wins, anything else is counted as '未知'.
# The "no disagreement" rule is checked first because a negative answer such
# as '没有分歧' also contains the positive keyword '分歧'.
# NOTE(review): in this copy of the file two keywords in each list were
# blank. '' is a substring of every string, so the blank keyword made the
# first rule match every cell. The blanks are removed here; restore the real
# keywords (probably single characters) from the original file.
conflict_rules = (
    (('没有',), '无分歧'),
    (('经常', '分歧'), '有分歧'),
)
conflict = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'K{row}'].value
    if not val:
        continue
    val_str = str(val).strip().lower()
    for keywords, category in conflict_rules:
        if any(keyword in val_str for keyword in keywords):
            conflict[category] += 1
            break
    else:
        # No rule matched this cell.
        conflict['未知'] += 1

# Most frequent category first; ties keep first-seen order.
for status, count in sorted(conflict.items(), key=lambda item: item[1], reverse=True):
    print(f"{status:8s}: {count:3d}")
# 4.2 Whether the guardian often negates / puts down the child (column L).
print("\n4.2 是否经常否定孩子第L列")
# Ordered (keywords, category) rules; the first rule with a keyword found in
# the cell text wins, anything else is counted as '未知'.
# NOTE(review): in this copy of the file two keywords in each list were
# blank. '' is a substring of every string, so the blank keyword made the
# first rule match every cell. The blanks are removed here; restore the real
# keywords (probably single characters) from the original file.
negation_rules = (
    (('经常', '是的'), '经常否定'),
    (('没有', '偶尔'), '不否定或少否定'),
)
negation = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'L{row}'].value
    if not val:
        continue
    val_str = str(val).strip().lower()
    for keywords, category in negation_rules:
        if any(keyword in val_str for keyword in keywords):
            negation[category] += 1
            break
    else:
        # No rule matched this cell.
        negation['未知'] += 1

# Most frequent category first; ties keep first-seen order.
for status, count in sorted(negation.items(), key=lambda item: item[1], reverse=True):
    print(f"{status:12s}: {count:3d}")
# 4.3 Whether scolding / physical punishment is used (column M).
print("\n4.3 是否有打骂教育第M列")
# Ordered (keywords, category) rules; the first rule with a keyword found in
# the cell text wins, anything else is counted as '未知'.
# NOTE(review): in this copy of the file four keywords of the '有打骂' list
# and three of the '无打骂' list were blank. '' is a substring of every
# string, so the blank keyword made the first rule match every cell. The
# blanks are removed here, which leaves only one keyword per category;
# restore the real keywords (probably single characters) from the original
# file before relying on these counts.
punishment_rules = (
    (('经常',), '有打骂'),
    (('没有',), '无打骂'),
)
punishment = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'M{row}'].value
    if not val:
        continue
    val_str = str(val).strip().lower()
    for keywords, category in punishment_rules:
        if any(keyword in val_str for keyword in keywords):
            punishment[category] += 1
            break
    else:
        # No rule matched this cell.
        punishment['未知'] += 1

# Most frequent category first; ties keep first-seen order.
for status, count in sorted(punishment.items(), key=lambda item: item[1], reverse=True):
    print(f"{status:8s}: {count:3d}")
print("\n【第五层:服务特征维度】")
print("-" * 100)

# 5.1 Purchased guidance period (column Q): tally each distinct cell value.
print("\n5.1 购买周期第Q列")
duration = defaultdict(int)
for row in range(2, ws.max_row + 1):  # starts at row 2; row 1 is presumably a header
    val = ws[f'Q{row}'].value
    if not val:
        continue
    period_text = str(val).strip()
    # Whitespace-only cells are treated as empty rather than counted as ''.
    if period_text:
        duration[period_text] += 1

# Most frequent period first; ties keep first-seen order.
for period, count in sorted(duration.items(), key=lambda item: item[1], reverse=True):
    print(f"{period:10s}: {count:3d}")
print("\n" + "="*100)
print("【推荐的标签体系】")
print("="*100)

# Proposed tag taxonomy: level -> {tag category -> number of tags in it}.
# The trailing comments list the tag values each count stands for.
tagcat_design = {
    '第一级-监护人维度': {
        '监护人身份': 7,  # 母亲, 父亲, 祖母, 祖父, 外祖母, 外祖父, other
        '文化程度': 7,  # primary, junior high, 中专, senior high, 大专, bachelor, master+
        '职业社会经济地位': 5,  # retired, medical/education/civil service, farm/industry, self-employed, other
    },
    '第二级-孩子维度': {
        '性别': 2,  # 男, 女
        '学段': 7,  # primary lower/upper, junior high early/graduating, senior high early/graduating, other
        '学习成绩': 4,  # 优秀, 良好, 一般, 差
    },
    '第三级-家庭维度': {
        '家庭结构': 5,  # nuclear, three-generation, raised by grandparents, divorced, single-parent
        '亲子关系': 3,  # 良好, 一般, 较差
    },
    '第四级-教育风险维度': {
        '教育理念一致性': 2,  # consistent, has disagreements
        '是否否定孩子': 2,  # yes, no
        '是否打骂': 2,  # yes, no
    },
    '第五级-服务特征维度': {
        '指导周期': 3,  # 60 days, 90 days, 180 days
    }
}

# Print every category with its tag count, a subtotal per level, and the
# grand total across all levels.
total_tags = 0
for level, categories in tagcat_design.items():
    print(f"\n{level}")
    for cat_name, tag_count in categories.items():
        print(f"{cat_name:20s}: {tag_count:2d} 个标签")
    level_total = sum(categories.values())
    print(f" ─ 小计:{level_total} 个标签")
    total_tags += level_total

# NOTE(review): ''*50 evaluates to an empty string, so this line prints only
# padding; it looks like a separator character was lost from the literal.
# Kept as is - restore the intended character from the original file.
print(f"\n{'':20s}{''*50}")
print(f"{'总计':20s}{total_tags:2d} 个标签")
print("\n" + "="*100)