Files
onion-dmp/scripts/quality-check-1.py
2026-04-08 14:52:09 +08:00

125 lines
4.0 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""全面质量检查脚本"""
import openpyxl
import sqlite3
print("\n" + "="*70)
print("🔍 全面质量检查")
print("="*70 + "\n")
# ============================================================================
# 1. Excel 文件对比
# ============================================================================
print("1⃣ EXCEL 文件结构和内容对比")
print("-"*70 + "\n")
wb0 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/家庭教育档案-天数.xlsx')
wb1 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗1.0.xlsx')
wb2 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗2.0.xlsx')
ws0, ws1, ws2 = wb0.active, wb1.active, wb2.active
print(f"📊 行列统计:")
print(f" 原始(家庭教育档案-天数): {ws0.max_row} rows × {ws0.max_column} cols")
print(f" 清洗1.0: {ws1.max_row} rows × {ws1.max_column} cols")
print(f" 清洗2.0: {ws2.max_row} rows × {ws2.max_column} cols")
# 列结构对比
print(f"\n📋 列结构对比:")
print(f" {'':<3} {'原始':<25} {'清洗1.0':<25} {'清洗2.0':<25} {'状态':<5}")
print(f" {'-'*3} {'-'*25} {'-'*25} {'-'*25} {'-'*5}")
for col in range(1, 17):
h0 = str(ws0.cell(1, col).value or '')[:22]
h1 = str(ws1.cell(1, col).value or '')[:22]
h2 = str(ws2.cell(1, col).value or '')[:22]
match = "" if h1 == h2 else ""
print(f" {col:<3} {h0:<25} {h1:<25} {h2:<25} {match:<5}")
# 数据完整性检查
print(f"\n✅ 数据完整性 (前100行检查):")
def check_null_rate(ws, start_col=1, end_col=16, rows=100):
results = {}
for col in range(start_col, min(end_col + 1, ws.max_column + 1)):
nulls = 0
total = 0
for row in range(2, min(rows + 2, ws.max_row + 1)):
total += 1
if ws.cell(row, col).value is None:
nulls += 1
if total > 0:
results[col] = (nulls, total, 100 * nulls / total)
return results
nulls1 = check_null_rate(ws1)
nulls2 = check_null_rate(ws2)
print(f" 清洗1.0: ", end="")
if all(rate == 0 for _, _, rate in nulls1.values()):
print("✓ 完全无缺失值")
else:
for col, (n, t, rate) in sorted(nulls1.items()):
if rate > 0:
print(f"{col}({rate:.0f}%) ", end="")
print(f"\n 清洗2.0: ", end="")
if all(rate == 0 for _, _, rate in nulls2.values()):
print("✓ 完全无缺失值")
else:
for col, (n, t, rate) in sorted(nulls2.items()):
if rate > 0:
print(f"{col}({rate:.0f}%) ", end="")
print()
# ============================================================================
# 2. 数据库内容检查
# ============================================================================
print(f"\n\n2⃣ 数据库内容检查")
print("-"*70 + "\n")
conn = sqlite3.connect('/Users/inkling/Desktop/dmp/dmp_onion.db')
cursor = conn.cursor()
# 用户数据
cursor.execute('SELECT COUNT(*) FROM users')
user_count = cursor.fetchone()[0]
print(f"👥 用户数: {user_count}")
# 标签数据
cursor.execute('SELECT COUNT(*) FROM tags')
tag_count = cursor.fetchone()[0]
print(f"🏷️ 标签数: {tag_count}")
# 分类数据
cursor.execute('SELECT COUNT(*) FROM tag_categories')
cat_count = cursor.fetchone()[0]
print(f"📂 分类数: {cat_count}")
# 关系数据
cursor.execute('SELECT COUNT(*) FROM user_tags')
rel_count = cursor.fetchone()[0]
print(f"🔗 关系数: {rel_count}")
# 分类分布
print(f"\n📊 标签分类分布:")
cursor.execute('''
SELECT tc.name, COUNT(DISTINCT t.id) as tag_count,
COUNT(DISTINCT ut.user_id) as user_count,
COUNT(ut.id) as rel_count
FROM tag_categories tc
LEFT JOIN tags t ON tc.id = t.category_id
LEFT JOIN user_tags ut ON t.id = ut.tag_id
GROUP BY tc.id
ORDER BY tc.id
''')
for row in cursor.fetchall():
name, tags, users, rels = row
coverage = f"{(users*100/user_count):.0f}%" if users else "0%"
print(f"{name:<20} {tags:3d} tags, {users:4d} users ({coverage:>3s}), {rels:5d} relations")
conn.close()
print("\n" + "="*70)