Update README and project cleanup
This commit is contained in:
92
analyze_new_data.py
Normal file
92
analyze_new_data.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
详细分析清洗1.0.xlsx文件
|
||||
"""
|
||||
|
||||
import openpyxl
|
||||
from collections import Counter
|
||||
import json
|
||||
|
||||
file_path = '/Users/inkling/Desktop/dmp/清洗1.0.xlsx'
|
||||
wb = openpyxl.load_workbook(file_path)
|
||||
ws = wb.active
|
||||
|
||||
print("\n" + "="*100)
|
||||
print("📊 清洗1.0.xlsx 文件详细分析")
|
||||
print("="*100)
|
||||
|
||||
# 获取所有表头
|
||||
print("\n【表头列表】")
|
||||
print("-" * 100)
|
||||
headers = []
|
||||
for i in range(1, ws.max_column + 1):
|
||||
header = ws.cell(1, i).value
|
||||
headers.append(header)
|
||||
col_letter = chr(64 + i) if i <= 26 else chr(64 + i // 26) + chr(64 + i % 26)
|
||||
print(f"{i:2d} ({col_letter:2s}): {header}")
|
||||
|
||||
print(f"\n📋 总列数:{len(headers)} 列")
|
||||
print(f"📋 总行数:{ws.max_row} 行(含表头)")
|
||||
print(f"📋 数据行数:{ws.max_row - 1} 行")
|
||||
|
||||
# 详细分析每列数据
|
||||
print("\n" + "="*100)
|
||||
print("【各列数据详情】")
|
||||
print("="*100)
|
||||
|
||||
for col_idx in range(1, min(ws.max_column + 1, 30)): # 分析前30列
|
||||
header = headers[col_idx - 1]
|
||||
if not header:
|
||||
continue
|
||||
|
||||
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + col_idx // 26) + chr(64 + col_idx % 26)
|
||||
values = []
|
||||
non_empty = 0
|
||||
|
||||
for row in range(2, min(ws.max_row + 1, 1000)): # 扫描前998行
|
||||
cell_value = ws[f'{col_letter}{row}'].value
|
||||
if cell_value is not None and str(cell_value).strip():
|
||||
non_empty += 1
|
||||
values.append(str(cell_value).strip())
|
||||
|
||||
# 统计唯一值
|
||||
unique_values = Counter(values)
|
||||
coverage = non_empty / (ws.max_row - 1) * 100 if ws.max_row > 1 else 0
|
||||
|
||||
print(f"\n列{col_idx:2d} ({col_letter}): {header}")
|
||||
print(f" 数据覆盖率:{coverage:.1f}% ({non_empty}/{ws.max_row - 1})")
|
||||
print(f" 唯一值数:{len(unique_values)}")
|
||||
|
||||
# 显示前10个唯一值及其频数
|
||||
if len(unique_values) <= 15:
|
||||
print(f" 详细值分布:")
|
||||
for val, count in unique_values.most_common(15):
|
||||
print(f" • {val:40s} ({count:4d}次)")
|
||||
else:
|
||||
print(f" 前15个值分布:")
|
||||
for val, count in unique_values.most_common(15):
|
||||
print(f" • {val:40s} ({count:4d}次)")
|
||||
print(f" ... 共{len(unique_values)}个唯一值")
|
||||
|
||||
print("\n" + "="*100)
|
||||
print("【数据质量评估】")
|
||||
print("="*100)
|
||||
|
||||
# 检查是否有明确的ID或KEY字段
|
||||
print("\n存在以下可能的ID字段(值基本不重复):")
|
||||
for col_idx in range(1, min(ws.max_column + 1, 30)):
|
||||
header = headers[col_idx - 1]
|
||||
col_letter = chr(64 + col_idx) if col_idx <= 26 else chr(64 + col_idx // 26) + chr(64 + col_idx % 26)
|
||||
|
||||
values = []
|
||||
for row in range(2, ws.max_row + 1):
|
||||
val = ws[f'{col_letter}{row}'].value
|
||||
if val:
|
||||
values.append(str(val).strip())
|
||||
|
||||
unique = len(set(values))
|
||||
if unique > 0 and unique / len(values) > 0.9: # 唯一性 > 90%
|
||||
print(f" • 列{col_idx} ({col_letter}): {header} - {unique}/{len(values)} 唯一值")
|
||||
|
||||
print("\n" + "="*100)
|
||||
Reference in New Issue
Block a user