Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

124
scripts/quality-check-1.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""全面质量检查脚本"""
import openpyxl
import sqlite3
print("\n" + "="*70)
print("🔍 全面质量检查")
print("="*70 + "\n")
# ============================================================================
# 1. Excel 文件对比
# ============================================================================
print("1⃣ EXCEL 文件结构和内容对比")
print("-"*70 + "\n")
wb0 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/家庭教育档案-天数.xlsx')
wb1 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗1.0.xlsx')
wb2 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗2.0.xlsx')
ws0, ws1, ws2 = wb0.active, wb1.active, wb2.active
print(f"📊 行列统计:")
print(f" 原始(家庭教育档案-天数): {ws0.max_row} rows × {ws0.max_column} cols")
print(f" 清洗1.0: {ws1.max_row} rows × {ws1.max_column} cols")
print(f" 清洗2.0: {ws2.max_row} rows × {ws2.max_column} cols")
# 列结构对比
print(f"\n📋 列结构对比:")
print(f" {'':<3} {'原始':<25} {'清洗1.0':<25} {'清洗2.0':<25} {'状态':<5}")
print(f" {'-'*3} {'-'*25} {'-'*25} {'-'*25} {'-'*5}")
for col in range(1, 17):
h0 = str(ws0.cell(1, col).value or '')[:22]
h1 = str(ws1.cell(1, col).value or '')[:22]
h2 = str(ws2.cell(1, col).value or '')[:22]
match = "" if h1 == h2 else ""
print(f" {col:<3} {h0:<25} {h1:<25} {h2:<25} {match:<5}")
# 数据完整性检查
print(f"\n✅ 数据完整性 (前100行检查):")
def check_null_rate(ws, start_col=1, end_col=16, rows=100):
results = {}
for col in range(start_col, min(end_col + 1, ws.max_column + 1)):
nulls = 0
total = 0
for row in range(2, min(rows + 2, ws.max_row + 1)):
total += 1
if ws.cell(row, col).value is None:
nulls += 1
if total > 0:
results[col] = (nulls, total, 100 * nulls / total)
return results
nulls1 = check_null_rate(ws1)
nulls2 = check_null_rate(ws2)
print(f" 清洗1.0: ", end="")
if all(rate == 0 for _, _, rate in nulls1.values()):
print("✓ 完全无缺失值")
else:
for col, (n, t, rate) in sorted(nulls1.items()):
if rate > 0:
print(f"{col}({rate:.0f}%) ", end="")
print(f"\n 清洗2.0: ", end="")
if all(rate == 0 for _, _, rate in nulls2.values()):
print("✓ 完全无缺失值")
else:
for col, (n, t, rate) in sorted(nulls2.items()):
if rate > 0:
print(f"{col}({rate:.0f}%) ", end="")
print()
# ============================================================================
# 2. 数据库内容检查
# ============================================================================
print(f"\n\n2⃣ 数据库内容检查")
print("-"*70 + "\n")
conn = sqlite3.connect('/Users/inkling/Desktop/dmp/dmp_onion.db')
cursor = conn.cursor()
# 用户数据
cursor.execute('SELECT COUNT(*) FROM users')
user_count = cursor.fetchone()[0]
print(f"👥 用户数: {user_count}")
# 标签数据
cursor.execute('SELECT COUNT(*) FROM tags')
tag_count = cursor.fetchone()[0]
print(f"🏷️ 标签数: {tag_count}")
# 分类数据
cursor.execute('SELECT COUNT(*) FROM tag_categories')
cat_count = cursor.fetchone()[0]
print(f"📂 分类数: {cat_count}")
# 关系数据
cursor.execute('SELECT COUNT(*) FROM user_tags')
rel_count = cursor.fetchone()[0]
print(f"🔗 关系数: {rel_count}")
# 分类分布
print(f"\n📊 标签分类分布:")
cursor.execute('''
SELECT tc.name, COUNT(DISTINCT t.id) as tag_count,
COUNT(DISTINCT ut.user_id) as user_count,
COUNT(ut.id) as rel_count
FROM tag_categories tc
LEFT JOIN tags t ON tc.id = t.category_id
LEFT JOIN user_tags ut ON t.id = ut.tag_id
GROUP BY tc.id
ORDER BY tc.id
''')
for row in cursor.fetchall():
name, tags, users, rels = row
coverage = f"{(users*100/user_count):.0f}%" if users else "0%"
print(f"{name:<20} {tags:3d} tags, {users:4d} users ({coverage:>3s}), {rels:5d} relations")
conn.close()
print("\n" + "="*70)