Files
onion-dmp/analyze_new_data.py
2026-04-08 14:52:09 +08:00

93 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
详细分析清洗1.0.xlsx文件
"""
import openpyxl
from collections import Counter
import json
file_path = '/Users/inkling/Desktop/dmp/清洗1.0.xlsx'
wb = openpyxl.load_workbook(file_path)
ws = wb.active
print("\n" + "="*100)
print("📊 清洗1.0.xlsx 文件详细分析")
print("="*100)
# 获取所有表头
print("\n【表头列表】")
print("-" * 100)
headers = []
for i in range(1, ws.max_column + 1):
header = ws.cell(1, i).value
headers.append(header)
col_letter = chr(64 + i) if i <= 26 else chr(64 + i // 26) + chr(64 + i % 26)
print(f"{i:2d} ({col_letter:2s}): {header}")
print(f"\n📋 总列数:{len(headers)}")
print(f"📋 总行数:{ws.max_row} 行(含表头)")
print(f"📋 数据行数:{ws.max_row - 1}")
# 详细分析每列数据
print("\n" + "="*100)
print("【各列数据详情】")
print("="*100)
for col_idx in range(1, min(ws.max_column + 1, 30)): # 分析前30列
header = headers[col_idx - 1]
if not header:
continue
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + col_idx // 26) + chr(64 + col_idx % 26)
values = []
non_empty = 0
for row in range(2, min(ws.max_row + 1, 1000)): # 扫描前998行
cell_value = ws[f'{col_letter}{row}'].value
if cell_value is not None and str(cell_value).strip():
non_empty += 1
values.append(str(cell_value).strip())
# 统计唯一值
unique_values = Counter(values)
coverage = non_empty / (ws.max_row - 1) * 100 if ws.max_row > 1 else 0
print(f"\n{col_idx:2d} ({col_letter}): {header}")
print(f" 数据覆盖率:{coverage:.1f}% ({non_empty}/{ws.max_row - 1})")
print(f" 唯一值数:{len(unique_values)}")
# 显示前10个唯一值及其频数
if len(unique_values) <= 15:
print(f" 详细值分布:")
for val, count in unique_values.most_common(15):
print(f"{val:40s} ({count:4d}次)")
else:
print(f" 前15个值分布")
for val, count in unique_values.most_common(15):
print(f"{val:40s} ({count:4d}次)")
print(f" ... 共{len(unique_values)}个唯一值")
print("\n" + "="*100)
print("【数据质量评估】")
print("="*100)
# 检查是否有明确的ID或KEY字段
print("\n存在以下可能的ID字段值基本不重复")
for col_idx in range(1, min(ws.max_column + 1, 30)):
header = headers[col_idx - 1]
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + col_idx // 26) + chr(64 + col_idx % 26)
values = []
for row in range(2, ws.max_row + 1):
val = ws[f'{col_letter}{row}'].value
if val:
values.append(str(val).strip())
unique = len(set(values))
if unique > 0 and unique / len(values) > 0.9: # 唯一性 > 90%
print(f" • 列{col_idx} ({col_letter}): {header} - {unique}/{len(values)} 唯一值")
print("\n" + "="*100)