Update README and project cleanup
This commit is contained in:
70
scripts/analyze-excel.py
Normal file
70
scripts/analyze-excel.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
import openpyxl
|
||||
|
||||
# Load both cleaned workbooks; only the active sheet of each is inspected.
wb1 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗1.0.xlsx')
wb2 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗2.0.xlsx')

ws1 = wb1.active
ws2 = wb2.active


def _print_first_rows(ws, heading):
    """Print *heading*, then the values of columns 1-7 for sheet rows 2-4."""
    print(heading)
    for row in range(2, 5):
        cols = [ws.cell(row, col).value for col in range(1, 8)]
        print(f" Row {row}: {cols}")


def _collect_roles(ws):
    """Return the distinct, stripped, non-blank column-1 values of rows 2..max_row.

    Values are stringified before stripping. Cells that are empty or contain
    only whitespace are skipped, so a blank cell never shows up as an
    empty-string "role" and inflates the unique count.
    """
    roles = set()
    for row in range(2, ws.max_row + 1):
        val = ws.cell(row, 1).value
        if not val:
            continue
        text = str(val).strip()
        if text:
            roles.add(text)
    return roles


# Preview the first three rows after row 1 of each file.
_print_first_rows(ws1, "清洗1.0 - First 3 users (columns 1-7):")
_print_first_rows(ws2, "\n清洗2.0 - First 3 users (columns 1-7):")

# Compare the distinct column-1 values (labelled as family-role values in the
# headings below) between the two files.
print("\n清洗1.0 中的家庭角色值:")
roles_1 = _collect_roles(ws1)
print(f"Unique values: {len(roles_1)}")

print("\n清洗2.0 中的家庭角色值:")
roles_2 = _collect_roles(ws2)
print(f"Unique values: {len(roles_2)}")

print("\nOverlap analysis:")
print(f"Matching roles: {len(roles_1 & roles_2)}")
print(f"Unique to 1.0: {len(roles_1 - roles_2)}")
print(f"Unique to 2.0: {len(roles_2 - roles_1)}")
|
||||
|
||||
# Check column mapping - create a unique key per row from columns 1-7
|
||||
def make_key(ws, row, num_cols=7, sep="|"):
    """Build a string fingerprint of one worksheet row.

    Args:
        ws: Object exposing ``cell(row, col)`` whose result has a ``.value``
            attribute (an openpyxl worksheet in this script).
        row: 1-based row index.
        num_cols: Number of leading columns (1..num_cols) included in the key.
        sep: Separator placed between the column values.

    Returns:
        The stringified cell values joined by *sep*; ``None`` cells contribute
        an empty string.

    Note:
        Values that themselves contain *sep* can make two different rows
        produce the same key; pick a separator that cannot occur in the data
        if that matters.
    """
    values = (ws.cell(row, col).value for col in range(1, num_cols + 1))
    return sep.join("" if value is None else str(value) for value in values)
|
||||
|
||||
print("\nChecking row overlap by first 7 columns:")

# Fingerprint every row from row 2 down to max_row in each sheet.
# NOTE(review): row 1 is skipped, presumably because it is a header - confirm.
keys_1 = {make_key(ws1, row_idx) for row_idx in range(2, ws1.max_row + 1)}
keys_2 = {make_key(ws2, row_idx) for row_idx in range(2, ws2.max_row + 1)}

# The "Total rows" figures are sizes of the key sets, i.e. distinct keys;
# duplicate rows within one sheet are counted once.
shared_keys = keys_1 & keys_2
overlap = len(shared_keys)
print(f"Matching rows: {overlap}")
print(f"Total rows 1.0: {len(keys_1)}")
print(f"Total rows 2.0: {len(keys_2)}")
|
||||
223
scripts/clean-family-role-noise-v2.js
Normal file
223
scripts/clean-family-role-noise-v2.js
Normal file
@@ -0,0 +1,223 @@
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
const db = getDb('onion');
|
||||
|
||||
const CATEGORY_KEY = 'basic_info_role';
|
||||
|
||||
const RENAME_MAP = new Map([
|
||||
// 妈妈系
|
||||
['母', '妈妈'],
|
||||
['妈', '妈妈'],
|
||||
['母亲', '妈妈'],
|
||||
['母 亲', '妈妈'],
|
||||
['母親', '妈妈'],
|
||||
['毋亲', '妈妈'],
|
||||
['妈 妈', '妈妈'],
|
||||
['妈吗', '妈妈'],
|
||||
['妈好', '妈妈'],
|
||||
['妈专', '妈妈'],
|
||||
['蚂妈', '妈妈'],
|
||||
['宝妈', '妈妈'],
|
||||
['全职妈妈', '妈妈'],
|
||||
['家庭主妇', '妈妈'],
|
||||
['主妇', '妈妈'],
|
||||
['家家庭主妇', '妈妈'],
|
||||
['女主人', '妈妈'],
|
||||
|
||||
// 爸爸系
|
||||
['父', '爸爸'],
|
||||
['爸', '爸爸'],
|
||||
['父亲', '爸爸'],
|
||||
['父 亲', '爸爸'],
|
||||
['孩子爸', '爸爸'],
|
||||
['爸专', '爸爸'],
|
||||
['爸备', '爸爸'],
|
||||
|
||||
// 祖辈系
|
||||
['祖父', '爷爷'],
|
||||
['姥爷', '外公'],
|
||||
['外爷', '外公'],
|
||||
['祖母', '奶奶'],
|
||||
['姥姥', '外婆'],
|
||||
['姥姥/外婆', '外婆'],
|
||||
['外婆', '外婆'],
|
||||
['婆婆', '奶奶'],
|
||||
|
||||
// 其他明确亲属
|
||||
['姑妈', '姑姑'],
|
||||
]);
|
||||
|
||||
// 这些值属于家庭角色中的明确亲属关系,保留即可
|
||||
const KEEP_SET = new Set([
|
||||
'妈妈', '爸爸', '爷爷', '奶奶', '外公', '外婆',
|
||||
'姑姑', '舅舅', '姨妈', '伯娘', '继母', '妻子',
|
||||
'女儿', '儿子', '姐姐', '父母', '家长', '其他监护人',
|
||||
]);
|
||||
|
||||
// 明显不是家庭角色的噪声、描述、乱码、占位符
|
||||
const DELETE_EXACT = new Set([
|
||||
'上班族', '母性', '女', '主', '主妇', '全职', '母中', '母女', '母子',
|
||||
'一般', '陪读', '父母', '母家', '高中', '经济', '无', '目前', '内勤',
|
||||
'带娃', '白黑', '家长', '全能', '次', '普通', '好人', '主导', '主角',
|
||||
'主内', '主&角初中', '初中', '文 化', '/', 'I', '13296773713',
|
||||
'盛自根', '经济支柱', '经济、教育、生活是核心', '助推庭教育',
|
||||
'呵护,做具体事', '教育陪伴孩子', '照孩子', '家庭主妇', '家家庭主妇',
|
||||
'妈专', '妈好', '妈吗', '妈 妈', '父 亲', '妈 亲', '母 亲', '母親',
|
||||
'母', '父', '爸', '孩子爸', '爸专', '爸备', '宝妈', '蚂妈', '毋亲',
|
||||
'外爷', '姥爷', '祖父', '祖母', '姑妈', '婆婆', '女主人', '母亲',
|
||||
]);
|
||||
|
||||
// Regexes that mark a tag name as noise: a name matching ANY entry is deleted.
const DELETE_PATTERNS = [
  /^\d+$/, // digits only
  // Symbols / whitespace only. This must be Unicode-aware: without the `u`
  // flag, `\W` matches every CJK character, so the old /^[\s\W_]+$/ treated
  // any all-Chinese name as "pure symbols" and deleted it.
  /^[^\p{L}\p{N}]+$/u,
  /联系方式|电话|手机号|微信/, // contact-info fragments
  // Substring matches for descriptions that are not family roles.
  /上班|内勤|经济|教育|陪伴|助推|呵护|主导|主角|全能|普通|一般|目前|无|好人|次/,
  /家庭主妇|主妇|全职|陪读|带娃/,
  /文化|初中|高中|白黑|盛自根/,
];
|
||||
|
||||
/**
 * Normalize a raw tag name to its canonical family-role name.
 *
 * @param {*} rawName Tag name as stored; falsy values are treated as empty.
 * @returns {string|null} null when the trimmed name is empty, the RENAME_MAP
 *   target when the trimmed name is a known variant, otherwise the trimmed name.
 */
function canonicalizeName(rawName) {
  const trimmed = String(rawName || '').trim();
  if (trimmed === '') {
    return null;
  }
  return RENAME_MAP.has(trimmed) ? RENAME_MAP.get(trimmed) : trimmed;
}
|
||||
|
||||
/**
 * Decide whether a tag name is noise that should be removed.
 *
 * @param {string} name Tag name to classify.
 * @returns {boolean} true when the name is listed in DELETE_EXACT or matches
 *   at least one regex in DELETE_PATTERNS.
 */
function shouldDelete(name) {
  const matchesNoisePattern = (pattern) => pattern.test(name);
  return DELETE_EXACT.has(name) || DELETE_PATTERNS.some(matchesNoisePattern);
}
|
||||
|
||||
/**
 * Recompute `coverage` and `coverage_rate` for every row in `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage as a percentage of all users, rounded to 2 decimals
 *
 * A single set-based UPDATE is used instead of one autocommitted UPDATE per
 * tag: the result is the same, but the refresh is atomic and needs one commit.
 *
 * @param {object} dbConn Connection exposing `prepare(sql)`, whose statements
 *   provide `get()` and `run()` (better-sqlite3 style).
 */
function updateStats(dbConn) {
  // Fall back to 1 so an empty `users` table cannot cause a division by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;

  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
|
||||
|
||||
/**
 * Clean up the family-role tag category (CATEGORY_KEY).
 *
 * Pass 1, for every tag in the category:
 *   - names in KEEP_SET are left alone;
 *   - known variants (RENAME_MAP) are merged into the canonical tag, or
 *     renamed to the canonical name when that tag does not exist yet;
 *   - names classified as noise by shouldDelete() are deleted together with
 *     their user relations;
 *   - anything else is kept unchanged.
 * Pass 2 re-scans what is left and merges names that contain a mother/father
 * character into the existing 妈妈 / 爸爸 tag.
 *
 * All changes run in one transaction; coverage stats are refreshed afterwards.
 * On failure the error is logged and the process exits with code 1.
 */
function main() {
  try {
    const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`);

    const catId = category.id;
    const tags = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);

    console.log('🧹 开始清理家庭角色噪声数据...');
    console.log(`📂 当前标签数: ${tags.length}`);

    let merged = 0;
    let renamed = 0;
    let deleted = 0;
    let kept = 0;

    const tx = db.transaction(() => {
      // Statements are prepared once and reused by both passes.
      const getByName = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
      const copyRels = db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id)
        SELECT user_id, ? FROM user_tags WHERE tag_id = ?`);
      const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
      const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');
      const updateTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?');

      // Delete a tag together with its user relations.
      const removeTag = (tagId) => {
        deleteRel.run(tagId);
        deleteTag.run(tagId);
      };

      // Move the relations of `sourceId` onto `targetId`, then drop the source tag.
      const mergeInto = (targetId, sourceId) => {
        copyRels.run(targetId, sourceId);
        removeTag(sourceId);
      };

      for (const tag of tags) {
        const originalName = String(tag.name || '').trim();
        const canonicalName = canonicalizeName(originalName);

        if (KEEP_SET.has(originalName)) {
          kept += 1;
          continue;
        }

        if (canonicalName && canonicalName !== originalName && KEEP_SET.has(canonicalName)) {
          const target = getByName.get(catId, canonicalName);
          if (target) {
            // Migrate the relations first, then delete the old tag.
            mergeInto(target.id, tag.id);
            console.log(`✅ 合并: ${originalName} -> ${canonicalName}`);
            merged += 1;
          } else {
            // No canonical tag yet: this variant becomes the canonical tag.
            // Later variants of the same role then merge into it.
            updateTag.run(canonicalName, tag.id);
            console.log(`✅ 重命名: ${originalName} -> ${canonicalName}`);
            renamed += 1;
          }
          continue;
        }

        // Not on the keep list: delete it when it is obvious noise.
        if (shouldDelete(originalName)) {
          removeTag(tag.id);
          console.log(`🗑️ 删除: ${originalName}`);
          deleted += 1;
          continue;
        }

        // No explicit rule: be conservative, keep the tag unchanged.
        kept += 1;
      }

      // Fallback pass over whatever survived pass 1.
      // NOTE(review): this is a substring match, so any remaining name that
      // contains one of these characters is folded into 妈妈 / 爸爸.
      const leftovers = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);
      for (const tag of leftovers) {
        const name = String(tag.name || '').trim();
        if (KEEP_SET.has(name)) continue;

        if (/妈|母|宝妈/.test(name)) {
          const target = getByName.get(catId, '妈妈');
          if (target && target.id !== tag.id) {
            mergeInto(target.id, tag.id);
            console.log(`✅ 合并(兜底): ${name} -> 妈妈`);
            merged += 1;
            continue;
          }
        }

        if (/爸|父|孩子爸/.test(name)) {
          const target = getByName.get(catId, '爸爸');
          if (target && target.id !== tag.id) {
            mergeInto(target.id, tag.id);
            console.log(`✅ 合并(兜底): ${name} -> 爸爸`);
            merged += 1;
            continue;
          }
        }
      }
    });

    tx();
    updateStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count,
        (SELECT COUNT(*) FROM tags WHERE name = '妈妈') AS mom_count,
        (SELECT COUNT(*) FROM tags WHERE name = '爸爸') AS dad_count,
        (SELECT COUNT(*) FROM tags WHERE name = '爷爷') AS grandpa_count,
        (SELECT COUNT(*) FROM tags WHERE name = '外公') AS mgp_count,
        (SELECT COUNT(*) FROM tags WHERE name = '外婆') AS mgm_count
    `).get(CATEGORY_KEY);

    console.log('\n✨ 清理完成');
    console.log(` • 合并: ${merged}`);
    console.log(` • 重命名: ${renamed}`);
    console.log(` • 删除: ${deleted}`);
    console.log(` • 保留(未改名): ${kept}`);
    console.log(` • 家庭角色标签剩余: ${stats.tag_count}`);
    console.log(` • 妈妈/爸爸/爷爷/外公/外婆 计数: ${stats.mom_count}/${stats.dad_count}/${stats.grandpa_count}/${stats.mgp_count}/${stats.mgm_count}`);

    db.close();
  } catch (error) {
    console.error('❌ 清理失败:', error);
    // Best effort: a failing close must not hide the original error.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
main();
|
||||
74
scripts/cleanup-invalid-tags.js
Normal file
74
scripts/cleanup-invalid-tags.js
Normal file
@@ -0,0 +1,74 @@
|
||||
const { getDb } = require('../db/init');
|
||||
const db = getDb('onion');
|
||||
|
||||
// 清理明显是错误或不相关的标签(家庭角色分类中)
|
||||
const FAMILY_ROLE_INVALID_TAGS = [
|
||||
'初中', // 学段标签,不是家庭角色
|
||||
'大姐', // 不是主要家庭角色
|
||||
'舅舅', // 叔舅角色,范围太小
|
||||
'妻子', // 不是孩子相关的家庭角色
|
||||
'母亲相当单亲家庭', // 错误数据
|
||||
'母子', // 不是标准家庭角色
|
||||
'女儿', // 这应该在不同分类
|
||||
'文 化', // 完全无关
|
||||
'*', // 符号
|
||||
];
|
||||
|
||||
/**
 * Delete the tags listed in FAMILY_ROLE_INVALID_TAGS, together with their
 * user relations, from the 家庭角色 category, then print a summary.
 *
 * The lookup is scoped to the 家庭角色 category so that a same-named tag in
 * another category is never removed. All deletions run in one transaction.
 * The process exits with code 0 on success and 1 on failure.
 */
function cleanupInvalidTags() {
  try {
    console.log('🧹 开始清理无效标签...\n');

    const category = db.prepare('SELECT id FROM tag_categories WHERE name = ?').get('家庭角色');
    if (!category) throw new Error('找不到分类: 家庭角色');

    const findTags = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?');
    const countUsers = db.prepare(
      'SELECT COUNT(DISTINCT user_id) as count FROM user_tags WHERE tag_id = ?'
    );
    const deleteRels = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');

    let deletedCount = 0;

    // Delete atomically: either every listed tag is removed or none is.
    const removeInvalid = db.transaction(() => {
      for (const tagName of FAMILY_ROLE_INVALID_TAGS) {
        // .all(): remove duplicates of the same name too, not only the first hit.
        for (const tag of findTags.all(category.id, tagName)) {
          const userCount = countUsers.get(tag.id);

          deleteRels.run(tag.id);
          deleteTag.run(tag.id);

          console.log(`✅ 删除: "${tagName}" (${userCount?.count || 0} 用户)`);
          deletedCount++;
        }
      }
    });
    removeInvalid();

    console.log(`\n✨ 清理完成!deleted: ${deletedCount}`);

    // Final state of the whole database.
    const finalCount = db.prepare('SELECT COUNT(*) as count FROM tags').get();
    const relationCount = db.prepare('SELECT COUNT(*) as count FROM user_tags').get();

    console.log(`\n📊 最终状态:`);
    console.log(` • 剩余标签总数: ${finalCount.count}`);
    console.log(` • 用户-标签关系总数: ${relationCount.count}`);

    // Remaining tags of the family-role category, ordered by stored coverage.
    console.log(`\n📋 家庭角色分类标签列表:`);
    const finalTags = db.prepare(
      `SELECT name, coverage, coverage_rate
       FROM tags
       WHERE category_id = ?
       ORDER BY coverage DESC`
    ).all(category.id);

    finalTags.forEach((tag, idx) => {
      console.log(` ${idx + 1}. ${tag.name}: ${tag.coverage} 用户 (${tag.coverage_rate}%)`);
    });

    console.log(`\n✨ 总计: ${finalTags.length} 个家庭角色标签`);

    db.close();
    process.exit(0);
  } catch (error) {
    console.error('❌ 错误:', error);
    // Best effort: a failing close must not hide the original error.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
cleanupInvalidTags();
|
||||
39
scripts/fix-category-order.js
Normal file
39
scripts/fix-category-order.js
Normal file
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const Database = require('better-sqlite3');
|
||||
const path = require('path');
|
||||
|
||||
const dbPath = path.join(__dirname, '../dmp_onion.db');
|
||||
const db = new Database(dbPath);
|
||||
|
||||
console.log('修复分类顺序...\n');

// Target sort_order for every category, keyed by hard-coded category id.
// `name` is used for log output only; it is not written to the database.
const updates = [
  { id: 46, sort: 0, name: '家庭角色' },
  { id: 34, sort: 1, name: '用户年龄段标签' },
  { id: 35, sort: 2, name: '孩子学段标签' },
  { id: 36, sort: 3, name: '家庭结构标签' },
  { id: 37, sort: 4, name: '教育风险标签' },
  { id: 38, sort: 5, name: '家庭支持度标签' },
  { id: 39, sort: 6, name: '付费能力标签' },
  { id: 40, sort: 7, name: '需求紧迫度标签' },
  { id: 41, sort: 8, name: '核心问题标签' },
  { id: 42, sort: 9, name: '干预难度标签' },
  { id: 43, sort: 10, name: '转化优先级标签' },
  { id: 44, sort: 11, name: '渠道适配标签' },
  { id: 45, sort: 12, name: '产品匹配标签' },
  { id: 47, sort: 13, name: '文化程度' },
  { id: 48, sort: 14, name: '服务周期标签' }
];

try {
  const stmt = db.prepare('UPDATE tag_categories SET sort_order = ? WHERE id = ?');

  // Apply every change in one transaction so the ordering is never half-written.
  const applyAll = db.transaction(() => {
    for (const item of updates) {
      const { changes } = stmt.run(item.sort, item.id);
      if (changes === 0) {
        // The ids are hard-coded: report a missing row instead of claiming success.
        console.warn(`⚠️ 未找到分类 ID:${item.id} (${item.name}),已跳过`);
        continue;
      }
      console.log(`${item.sort + 1}. ${item.name}`);
    }
  });
  applyAll();

  console.log('\n✅ 完成!');
} finally {
  // Close the connection whether or not the update succeeded.
  db.close();
}
|
||||
94
scripts/fix-duplicate-category.js
Normal file
94
scripts/fix-duplicate-category.js
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* 修复分类重复问题
|
||||
* 1. 删除"用户身份标签"分类及其所有标签和关系
|
||||
* 2. 把"家庭角色"移到第一个位置
|
||||
* 3. 调整其他分类的sort_order
|
||||
*/
|
||||
|
||||
const Database = require('better-sqlite3');
|
||||
const path = require('path');
|
||||
|
||||
const dbPath = path.join(__dirname, '../dmp_onion.db');
|
||||
const db = new Database(dbPath);
|
||||
|
||||
console.log('\n╔════════════════════════════════════════════════════════════════╗');
|
||||
console.log('║ 🔧 修复分类重复问题 ║');
|
||||
console.log('╚════════════════════════════════════════════════════════════════╝\n');
|
||||
|
||||
// Hard-coded category ids this one-off script operates on.
const DUPLICATE_CATEGORY_ID = 33; // "用户身份标签", removed entirely
const FAMILY_ROLE_CATEGORY_ID = 46; // "家庭角色", moved to the first position

try {
  // Steps 1-6 change data and run in one transaction, so a failure halfway
  // cannot leave deleted relations next to surviving tags or a half-renumbered list.
  const purgeAndReorder = db.transaction(() => {
    // 1. Collect the ids of every tag in the duplicate category.
    console.log('1️⃣ 获取\"用户身份标签\"的所有标签...');
    const tagIds = db.prepare('SELECT id FROM tags WHERE category_id = ?').all(DUPLICATE_CATEGORY_ID);
    console.log(` 找到 ${tagIds.length} 个标签`);

    // 2. Delete the user_tags relations of those tags.
    console.log('\n2️⃣ 删除user_tags关系...');
    const deleteRelStmt = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    let relDeleted = 0;
    for (const tag of tagIds) {
      relDeleted += deleteRelStmt.run(tag.id).changes;
    }
    console.log(` 删除了 ${relDeleted} 条关系`);

    // 3. Delete the tags themselves.
    console.log('\n3️⃣ 删除标签...');
    const tagDeleteResult = db.prepare('DELETE FROM tags WHERE category_id = ?').run(DUPLICATE_CATEGORY_ID);
    console.log(` 删除了 ${tagDeleteResult.changes} 个标签`);

    // 4. Delete the category row.
    console.log('\n4️⃣ 删除分类...');
    const catDeleteResult = db.prepare('DELETE FROM tag_categories WHERE id = ?').run(DUPLICATE_CATEGORY_ID);
    console.log(` 删除了 ${catDeleteResult.changes} 个分类`);

    // 5. Put the family-role category first (sort_order 0).
    console.log('\n5️⃣ 更新\"家庭角色\"的位置...');
    db.prepare('UPDATE tag_categories SET sort_order = 0 WHERE id = ?').run(FAMILY_ROLE_CATEGORY_ID);
    console.log(' ✓ 家庭角色现在排在第一位');

    // 6. Renumber the remaining categories in their current order.
    console.log('\n6️⃣ 重新调整其他分类的顺序...');
    const categories = db.prepare('SELECT id, key, name, sort_order FROM tag_categories ORDER BY sort_order').all();
    const setOrder = db.prepare('UPDATE tag_categories SET sort_order = ? WHERE id = ?');

    // Start at 1: slot 0 belongs to the family-role category. Starting at 0
    // gave the first remaining category a duplicate sort_order of 0.
    let newOrder = 1;
    let processed = 0;
    for (const cat of categories) {
      if (cat.id === FAMILY_ROLE_CATEGORY_ID) continue; // already at 0
      if (cat.sort_order !== newOrder) {
        setOrder.run(newOrder, cat.id);
      }
      newOrder++;
      processed++;
    }
    console.log(` ✓ 调整了 ${processed} 个分类`);
  });
  purgeAndReorder();

  // 7. Show the final category list.
  console.log('\n7️⃣ 最终分类列表:');
  const finalCats = db.prepare('SELECT id, key, name, sort_order FROM tag_categories ORDER BY sort_order').all();
  for (const cat of finalCats) {
    console.log(` ${cat.sort_order + 1}. ${cat.name} (ID:${cat.id})`);
  }

  // 8. Overall row counts.
  console.log('\n📊 数据统计:');
  const stats = db.prepare(`
    SELECT
      (SELECT COUNT(*) FROM users) as 总用户,
      (SELECT COUNT(*) FROM tags) as 总标签,
      (SELECT COUNT(*) FROM tag_categories) as 分类数,
      (SELECT COUNT(*) FROM user_tags) as 总关系
  `).get();

  console.log(` • 总用户: ${stats.总用户}`);
  console.log(` • 总标签: ${stats.总标签}`);
  console.log(` • 分类数: ${stats.分类数} (从16减少到15)`);
  console.log(` • 总关系: ${stats.总关系}`);

  console.log('\n✅ 修复完成!\n');
} catch (e) {
  console.error('❌ 错误:', e.message);
  // Set the exit code rather than calling process.exit(1): exit() would end
  // the process immediately and skip the `finally` block that closes the db.
  process.exitCode = 1;
} finally {
  db.close();
}
|
||||
140
scripts/fix-family-role-canonical-names.js
Normal file
140
scripts/fix-family-role-canonical-names.js
Normal file
@@ -0,0 +1,140 @@
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
const db = getDb('onion');
|
||||
const CATEGORY_KEY = 'basic_info_role';
|
||||
|
||||
const FATHER_SYNONYMS = ['父', '爸', '父 亲', '孩子爸', '爸专', '爸备'];
|
||||
const GRANDPA_SYNONYMS = ['姥爷', '外爷'];
|
||||
const GRANDMA_SYNONYMS = ['姥姥', '姥姥/外婆'];
|
||||
const GRANDSON_SYNONYMS = [];
|
||||
|
||||
/**
 * Recompute `coverage` and `coverage_rate` for every row in `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage as a percentage of all users, rounded to 2 decimals
 *
 * A single set-based UPDATE is used instead of one autocommitted UPDATE per
 * tag: the result is the same, but the refresh is atomic and needs one commit.
 *
 * @param {object} dbConn Connection exposing `prepare(sql)`, whose statements
 *   provide `get()` and `run()` (better-sqlite3 style).
 */
function updateStats(dbConn) {
  // Fall back to 1 so an empty `users` table cannot cause a division by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;

  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
|
||||
|
||||
/**
 * Collapse synonym tags of the family-role category (CATEGORY_KEY) into their
 * canonical names: 爸爸, 奶奶, 外婆 and 外公.
 *
 * Steps (single transaction):
 *   1. Make sure 爸爸 / 外婆 exist by renaming 父亲 / 姥姥/外婆 when missing.
 *   2. Merge father synonyms, including 父亲 when 爸爸 already existed, into 爸爸.
 *   3. Merge 婆婆 into 奶奶 and the GRANDMA_SYNONYMS into 外婆.
 *   4. Merge the GRANDPA_SYNONYMS (姥爷, 外爷) into 外公. They used to be
 *      merged into 爷爷, which conflated the two grandfathers and left the
 *      dedicated 姥爷 -> 外公 step with nothing to do.
 * A merge is skipped when its canonical tag does not exist.
 *
 * Coverage stats are refreshed afterwards. On failure the error is logged and
 * the process exits with code 1.
 */
function main() {
  try {
    const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`);
    const catId = category.id;

    const getTag = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
    const renameTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?');
    const mergeRel = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');

    // Rename `fallbackName` to `canonicalName` when the canonical tag is missing.
    const ensureCanonical = (canonicalName, fallbackName) => {
      if (getTag.get(catId, canonicalName)) return;
      const source = getTag.get(catId, fallbackName);
      if (!source) return;
      renameTag.run(canonicalName, source.id);
      console.log(`✅ 重命名: ${fallbackName} -> ${canonicalName}`);
    };

    // Move the relations of every existing synonym tag onto the canonical tag
    // and delete the synonym. Does nothing when the canonical tag is missing.
    const mergeSynonyms = (canonicalName, synonyms) => {
      const target = getTag.get(catId, canonicalName);
      if (!target) return;
      for (const synonym of synonyms) {
        const tag = getTag.get(catId, synonym);
        if (!tag || tag.id === target.id) continue;
        mergeRel.run(target.id, tag.id);
        deleteRel.run(tag.id);
        deleteTag.run(tag.id);
        console.log(`✅ 合并: ${synonym} -> ${canonicalName}`);
      }
    };

    const tx = db.transaction(() => {
      // 1) Make sure the canonical names exist: 爸爸, 外婆.
      ensureCanonical('爸爸', '父亲');
      ensureCanonical('外婆', '姥姥/外婆');

      // 2) Father family. 父亲 is included so it is merged when 爸爸 existed
      //    from the start (after a rename it no longer exists and is skipped).
      mergeSynonyms('爸爸', ['父亲', ...FATHER_SYNONYMS]);

      // 3) Grandmothers.
      mergeSynonyms('奶奶', ['婆婆']);
      mergeSynonyms('外婆', GRANDMA_SYNONYMS);

      // 4) Maternal grandfather: 姥爷 / 外爷 belong with 外公, not 爷爷.
      mergeSynonyms('外公', GRANDPA_SYNONYMS);
    });

    tx();
    updateStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count
    `).get(CATEGORY_KEY);

    console.log('\n✨ 标准名修复完成');
    console.log(` • 家庭角色标签剩余: ${stats.tag_count}`);
    console.log(` • 用户-标签关系总数: ${stats.rel_count}`);

    db.close();
  } catch (error) {
    console.error('❌ 标准名修复失败:', error);
    // Best effort: a failing close must not hide the original error.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
main();
|
||||
80
scripts/fix-family-role-final-slimdown.js
Normal file
80
scripts/fix-family-role-final-slimdown.js
Normal file
@@ -0,0 +1,80 @@
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
const db = getDb('onion');
|
||||
const CATEGORY_KEY = 'basic_info_role';
|
||||
|
||||
const MERGE_TO_OTHER = ['家长', '父母'];
|
||||
const DELETE_ONLY = ['妻子', '女儿', '姐姐', '儿子'];
|
||||
|
||||
/**
 * Recompute `coverage` and `coverage_rate` for every row in `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage as a percentage of all users, rounded to 2 decimals
 *
 * A single set-based UPDATE is used instead of one autocommitted UPDATE per
 * tag: the result is the same, but the refresh is atomic and needs one commit.
 *
 * @param {object} dbConn Connection exposing `prepare(sql)`, whose statements
 *   provide `get()` and `run()` (better-sqlite3 style).
 */
function updateStats(dbConn) {
  // Fall back to 1 so an empty `users` table cannot cause a division by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;

  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
|
||||
|
||||
/**
 * Second slim-down pass over the family-role category (CATEGORY_KEY).
 *
 * Tags named in MERGE_TO_OTHER have their user relations moved onto the
 * existing "其他监护人" tag and are then deleted; tags named in DELETE_ONLY are
 * deleted together with their relations. Both steps share one transaction.
 * Coverage stats are refreshed afterwards and a short summary is printed.
 *
 * Throws (caught below, exit code 1) when the category or the "其他监护人"
 * tag cannot be found.
 */
function main() {
  try {
    const categoryRow = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!categoryRow) {
      throw new Error(`找不到分类: ${CATEGORY_KEY}`);
    }
    const { id: catId } = categoryRow;

    const getTag = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
    const mergeRel = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');

    const other = getTag.get(catId, '其他监护人');
    if (!other) {
      throw new Error('找不到“其他监护人”标签,无法合并');
    }

    // Remove a tag's relations first, then the tag row itself.
    const dropTag = (tagId) => {
      deleteRel.run(tagId);
      deleteTag.run(tagId);
    };

    const slimDown = db.transaction(() => {
      MERGE_TO_OTHER.forEach((name) => {
        const tag = getTag.get(catId, name);
        if (tag && tag.id !== other.id) {
          mergeRel.run(other.id, tag.id);
          dropTag(tag.id);
          console.log(`✅ 合并: ${name} -> 其他监护人`);
        }
      });

      DELETE_ONLY.forEach((name) => {
        const tag = getTag.get(catId, name);
        if (tag) {
          dropTag(tag.id);
          console.log(`🗑️ 删除: ${name}`);
        }
      });
    });

    slimDown();
    updateStats(db);

    const summary = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count
    `).get(CATEGORY_KEY);

    console.log('\n✨ 二次收敛完成');
    console.log(` • 家庭角色标签剩余: ${summary.tag_count}`);
    console.log(` • 用户-标签关系总数: ${summary.rel_count}`);

    db.close();
  } catch (error) {
    console.error('❌ 二次收敛失败:', error);
    // Ignore close errors so the original failure stays the reported one.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
main();
|
||||
84
scripts/fix-tag-coverage.js
Normal file
84
scripts/fix-tag-coverage.js
Normal file
@@ -0,0 +1,84 @@
|
||||
/**
|
||||
* 修复标签覆盖率统计
|
||||
* 更新所有标签的coverage和coverage_rate字段
|
||||
*/
|
||||
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
/**
 * Recompute `coverage` (distinct users per tag) and `coverage_rate`
 * (percentage of all users, 2 decimals) for every tag, then print a sample
 * and overall statistics.
 *
 * @param {string} [dbSuffix='onion'] Suffix passed to getDb() to pick the database.
 *
 * The connection is closed on every path. When there are no users nothing is
 * updated. On error the message is logged and the process exits with code 1.
 */
function updateTagStats(dbSuffix = 'onion') {
  const db = getDb(dbSuffix);

  try {
    // Total user count is the denominator of coverage_rate.
    const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n;

    if (totalUsers === 0) {
      console.error('❌ 没有用户数据');
      db.close(); // do not leak the handle on the early return
      return;
    }

    console.log(`\n🔄 更新标签覆盖率统计(总用户数: ${totalUsers})`);

    const tags = db.prepare('SELECT id FROM tags').all();

    // Prepare both statements once instead of once per tag.
    const countCoverage = db.prepare(`
      SELECT COUNT(DISTINCT user_id) as cnt FROM user_tags WHERE tag_id = ?
    `);
    const stmt = db.prepare(`
      UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
    `);

    let updated = 0;

    // One transaction: a single commit, and no half-updated stats on failure.
    const refreshAll = db.transaction(() => {
      for (const tag of tags) {
        const coverage = countCoverage.get(tag.id).cnt || 0;
        const coverage_rate = +(coverage / totalUsers * 100).toFixed(2);

        stmt.run(coverage, coverage_rate, tag.id);
        updated++;

        if (updated % 50 === 0) {
          console.log(` ✓ 已更新 ${updated} 个标签...`);
        }
      }
    });
    refreshAll();

    console.log(`\n✅ 更新完成: ${updated} 个标签\n`);

    // Show a few sample rows.
    console.log('📊 样本数据(前5个标签):');
    const samples = db.prepare(`
      SELECT id, name, coverage, coverage_rate FROM tags LIMIT 5
    `).all();

    for (const sample of samples) {
      console.log(` • ${sample.name}: ${sample.coverage} users (${sample.coverage_rate}%)`);
    }

    // Show overall statistics.
    console.log('\n📊 整体统计:');
    const stats = db.prepare(`
      SELECT
        MIN(coverage) as min_coverage,
        MAX(coverage) as max_coverage,
        ROUND(AVG(coverage), 2) as avg_coverage,
        COUNT(*) as total_tags
      FROM tags
    `).get();

    console.log(` • 总标签数: ${stats.total_tags}`);
    console.log(` • 覆盖范围: ${stats.min_coverage} - ${stats.max_coverage} 用户`);
    console.log(` • 平均覆盖: ${stats.avg_coverage} 用户`);

    db.close();
  } catch (e) {
    console.error('❌ 错误:', e.message);
    // Best effort: a failing close must not hide the original error.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
// 执行更新
|
||||
updateTagStats();
|
||||
291
scripts/generate-missing-tags.js
Normal file
291
scripts/generate-missing-tags.js
Normal file
@@ -0,0 +1,291 @@
|
||||
/**
|
||||
* 为清洗2.0中的所有用户生成标签
|
||||
*
|
||||
* 策略:对于没有标签的用户,基于其他列的值生成标签
|
||||
* - 用户年龄段标签 <- 年龄(列4)
|
||||
* - 孩子学段标签 <- 年级(列7)
|
||||
* - 教育风险标签 <- 综合判断
|
||||
* 等
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
console.log('\n╔════════════════════════════════════════════════════════════════╗');
|
||||
console.log('║ 🏷️ 生成缺失的标签数据 ║');
|
||||
console.log('╚════════════════════════════════════════════════════════════════╝\n');
|
||||
|
||||
const db = getDb('onion');
|
||||
|
||||
// 读取清洗2.0 找出缺失标签的用户
|
||||
console.log('📖 读取清洗2.0.xlsx...');
|
||||
const wb = new ExcelJS.Workbook();
|
||||
await wb.xlsx.readFile(path.join(__dirname, '../清洗2.0.xlsx'));
|
||||
const ws = wb.worksheets[0];
|
||||
|
||||
// 标签生成规则
|
||||
const TAG_GENERATORS = {
|
||||
user_age_group: (row) => {
|
||||
// 列4:年龄
|
||||
const age = parseInt(row.values[4]);
|
||||
if (!age || isNaN(age)) return null;
|
||||
if (age < 30) return '年轻(25-35岁)';
|
||||
if (age < 45) return '中青年(35-45岁)';
|
||||
if (age < 55) return '中年(45-55岁)';
|
||||
if (age < 65) return '中老年(55-65岁)';
|
||||
return '低龄老年(65-75岁)';
|
||||
},
|
||||
|
||||
child_grade: (row) => {
|
||||
// 列7:年级
|
||||
const grade = row.values[7];
|
||||
if (!grade) return null;
|
||||
const gradeStr = String(grade).toLowerCase();
|
||||
if (gradeStr.includes('幼') || gradeStr.includes('小学')) {
|
||||
if (gradeStr.includes('低')) return '小学低段(1-3年级)';
|
||||
return '小学高段(4-6年级)';
|
||||
}
|
||||
if (gradeStr.includes('初')) return '初中前期(初一初二)';
|
||||
if (gradeStr.includes('高')) return '高中前期(高一高二)';
|
||||
return '小学高段(4-6年级)';
|
||||
},
|
||||
|
||||
family_structure: (row) => {
|
||||
// 列1:家庭角色, 列5:家庭角色_2
|
||||
const role = String(row.values[1] || '');
|
||||
const role2 = String(row.values[5] || '');
|
||||
|
||||
const hasGrandparents = role.includes('祖') || role.includes('外祖') || role2.includes('祖') || role2.includes('外祖');
|
||||
const isSingleParent = role.includes('单亲') || role.includes('离异');
|
||||
|
||||
if (isSingleParent) return '离异家庭+隔代抚养-双重风险';
|
||||
if (hasGrandparents) return '三代同堂-传统大家庭';
|
||||
return '核心家庭-父母直接养育';
|
||||
},
|
||||
|
||||
education_risk: (row) => {
|
||||
// 综合判断:综合多个因素
|
||||
const score = [
|
||||
(String(row.values[8] || '').includes('差') ? 3 : 0), // 学习成绩
|
||||
(String(row.values[12] || '').includes('是') ? 2 : 0), // 否定孩子
|
||||
(String(row.values[13] || '').includes('是') ? 3 : 0) // 打骂教育
|
||||
].reduce((a, b) => a + b, 0);
|
||||
|
||||
if (score >= 5) return '高风险(5分)';
|
||||
if (score >= 3) return '中高风险(3分)';
|
||||
return '低风险(1分)';
|
||||
},
|
||||
|
||||
family_support: (row) => {
|
||||
// 亲子关系、有无分歧
|
||||
const relation = String(row.values[10] || '');
|
||||
const divergence = String(row.values[11] || '');
|
||||
|
||||
const score = [
|
||||
(relation.includes('良好') ? 2 : 0),
|
||||
(divergence.includes('是') ? -1 : 1)
|
||||
].reduce((a, b) => a + b, 0);
|
||||
|
||||
if (score >= 2) return '高支持度(5分)';
|
||||
if (score >= 1) return '中等支持度(3分)';
|
||||
return '低支持度(2分)';
|
||||
},
|
||||
|
||||
payment_ability: (row) => {
|
||||
// 职业、年龄(推断收入)
|
||||
const profession = String(row.values[3] || '');
|
||||
const education = String(row.values[2] || '');
|
||||
|
||||
const highProf = ['医', '律', '教授', '总监', '经理', '总经理', 'CFO'].some(x => profession.includes(x));
|
||||
const highEdu = education.includes('硕') || education.includes('博');
|
||||
|
||||
if (highProf || highEdu) return '高付费能力(4分)';
|
||||
if (profession.includes('企业') || profession.includes('工程')) return '中等付费能力(0分)';
|
||||
return '基础付费能力(-2分)';
|
||||
},
|
||||
|
||||
urgency: (row) => {
|
||||
// 学习成绩、手机依赖等
|
||||
const score = String(row.values[8] || '');
|
||||
const behavior = [row.values[12], row.values[13]].map(x => String(x)).join('');
|
||||
|
||||
if (behavior.match(/打|责|否定/)) return '高度紧急(6分)';
|
||||
if (score.includes('差')) return '轻度紧急(1分)';
|
||||
return '常规咨询(0分)';
|
||||
},
|
||||
|
||||
core_problem: (row) => {
|
||||
// 问题描述(列16)
|
||||
const desc = String(row.values[16] || '');
|
||||
if (!desc) return '问题描述不足-需深入了解';
|
||||
if (desc.includes('成绩')) return '【学业】成绩下滑';
|
||||
if (desc.includes('游戏') || desc.includes('手机')) return '【行为】手机/游戏依赖';
|
||||
if (desc.includes('关系')) return '【关系】亲子冲突严重';
|
||||
return '【学业】成绩下滑';
|
||||
},
|
||||
|
||||
intervention_difficulty: (row) => {
|
||||
// 家庭角色分散、教育不当
|
||||
const roles = [row.values[1], row.values[5]].map(x => String(x)).join('|');
|
||||
const education = String([row.values[12], row.values[13]].join(''));
|
||||
|
||||
const score = [
|
||||
(roles.split('|').length > 1 ? 2 : 0),
|
||||
(education.includes('是') ? 3 : 0)
|
||||
].reduce((a, b) => a + b, 0);
|
||||
|
||||
if (score >= 4) return '极高难度(10分)';
|
||||
if (score >= 2) return '中等难度(4分)';
|
||||
return '较低难度(2分)';
|
||||
},
|
||||
|
||||
conversion_priority: (row) => {
|
||||
// 综合优先级
|
||||
const grade = String(row.values[7] || '');
|
||||
const highPriority = grade.includes('高中');
|
||||
return highPriority ? 'B级优先(50分)' : 'C级优先(49分)';
|
||||
},
|
||||
|
||||
channel_adaption: (row) => {
|
||||
// 年龄推断沟通渠道
|
||||
const age = parseInt(row.values[4]);
|
||||
if (age && age > 55) return '电话跟进优先 > 子女协助转化 > 微信语音';
|
||||
return '微信私域 > 电话跟进 > 朋友圈';
|
||||
},
|
||||
|
||||
product_match: (row) => {
|
||||
// 学段匹配产品
|
||||
const grade = String(row.values[7] || '');
|
||||
if (grade.includes('高中')) return '高考压力疏导 + 厌学干预方案';
|
||||
if (grade.includes('初中')) return '青春期应对方案 + 学习动力激活';
|
||||
return '习惯养成课程 + 亲子沟通指导';
|
||||
},
|
||||
|
||||
service_duration: (row) => {
  // A description (column 16) mentioning suspension or dropping out selects the long cycle.
  const description = String(row.values[16] || '');
  const longTerm = ['休学', '辍学'].some((kw) => description.includes(kw));
  return longTerm ? '长周期(180天)' : '标准周期(60天)';
}
|
||||
};
|
||||
|
||||
// 获取分类ID映射
|
||||
const catIdMap = {};
|
||||
const categories = db.prepare('SELECT id, key FROM tag_categories').all();
|
||||
for (const cat of categories) {
|
||||
catIdMap[cat.key] = cat.id;
|
||||
}
|
||||
|
||||
console.log('');
|
||||
|
||||
// 对每一行生成标签
|
||||
let generated = 0;
|
||||
let inserted = 0;
|
||||
const tagCache = {};
|
||||
|
||||
const insertTagStmt = db.prepare(`
|
||||
INSERT OR IGNORE INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
|
||||
VALUES (?, ?, ?, 0, 0, 0)
|
||||
`);
|
||||
|
||||
const getTagIdStmt = db.prepare(`
|
||||
SELECT id FROM tags WHERE category_id = ? AND name = ?
|
||||
`);
|
||||
|
||||
const getOrCreateUserTagStmt = db.prepare(`
|
||||
INSERT OR IGNORE INTO user_tags (user_id, tag_id)
|
||||
VALUES (?, ?)
|
||||
`);
|
||||
|
||||
ws.eachRow((row, rowNum) => {
|
||||
if (rowNum === 1) return; // skip header
|
||||
|
||||
// 获取用户
|
||||
const userKey = `user_${rowNum}`;
|
||||
const user = db.prepare('SELECT id FROM users WHERE uid = ?').get(userKey);
|
||||
if (!user) return;
|
||||
|
||||
// 对每个分类尝试生成标签
|
||||
for (const [catKey, generator] of Object.entries(TAG_GENERATORS)) {
|
||||
try {
|
||||
const tagValue = generator(row);
|
||||
if (!tagValue) continue;
|
||||
|
||||
const catId = catIdMap[catKey];
|
||||
if (!catId) continue;
|
||||
|
||||
// 检查用户是否已有该分类的标签
|
||||
const existing = db.prepare(`
|
||||
SELECT COUNT(*) as cnt FROM user_tags ut
|
||||
JOIN tags t ON ut.tag_id = t.id
|
||||
WHERE ut.user_id = ? AND t.category_id = ?
|
||||
`).get(user.id, catId);
|
||||
|
||||
if (existing.cnt > 0) continue; // 跳过已有标签的
|
||||
|
||||
// 创建或获取标签
|
||||
const cacheKey = `${catId}:${tagValue}`;
|
||||
let tagId = tagCache[cacheKey];
|
||||
|
||||
if (!tagId) {
|
||||
let tag = getTagIdStmt.get(catId, tagValue);
|
||||
if (!tag) {
|
||||
insertTagStmt.run(
|
||||
`${catKey}_${Math.random().toString(36).slice(2)}`,
|
||||
tagValue,
|
||||
catId
|
||||
);
|
||||
tag = getTagIdStmt.get(catId, tagValue);
|
||||
}
|
||||
tagId = tag?.id;
|
||||
if (tagId) tagCache[cacheKey] = tagId;
|
||||
}
|
||||
|
||||
if (tagId) {
|
||||
getOrCreateUserTagStmt.run(user.id, tagId);
|
||||
inserted++;
|
||||
}
|
||||
} catch (e) {
|
||||
// 跳过生成失败的标签
|
||||
}
|
||||
}
|
||||
|
||||
generated++;
|
||||
if (generated % 500 === 0) {
|
||||
console.log(` ✓ 已处理 ${generated} 行...`);
|
||||
}
|
||||
});
|
||||
|
||||
console.log(`\n✅ 标签生成完成:`);
|
||||
console.log(` • 处理用户: ${generated}`);
|
||||
console.log(` • 新增标签链接: ${inserted}`);
|
||||
|
||||
// 显示统计
|
||||
console.log('\n📊 最终标签分布:');
|
||||
const tagStats = db.prepare(`
|
||||
SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
|
||||
FROM tag_categories tc
|
||||
LEFT JOIN tags t ON tc.id = t.category_id
|
||||
LEFT JOIN user_tags ut ON t.id = ut.tag_id
|
||||
GROUP BY tc.id
|
||||
ORDER BY tc.id
|
||||
LIMIT 16
|
||||
`).all();
|
||||
|
||||
for (const stat of tagStats) {
|
||||
const coverage = stat.user_count ? Math.round((stat.user_count / 1929) * 100) : 0;
|
||||
console.log(` • ${stat.name.padEnd(20)}: ${stat.tag_count} tags, ${stat.user_count || 0} users (${coverage}%)`);
|
||||
}
|
||||
|
||||
db.close();
|
||||
} catch (e) {
|
||||
console.error('❌ Error:', e.message);
|
||||
console.error(e);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
382
scripts/import-clean-data-v2.js
Normal file
382
scripts/import-clean-data-v2.js
Normal file
@@ -0,0 +1,382 @@
|
||||
/**
|
||||
* 新数据导入脚本 v4.0
|
||||
* 基于"清洗2.0.xlsx"的完整数据导入
|
||||
*
|
||||
* 特点:
|
||||
* - 导入1956行用户数据
|
||||
* - 直接使用清洗2.0中的预生成标签(第17-31列)
|
||||
* - 创建16个标签分类
|
||||
*
|
||||
* 用法: node scripts/import-clean-data-v2.js
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb, initializeDatabase } = require('../db/init');
|
||||
|
||||
const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx');
|
||||
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
// 标签分类定义 - 16个分类
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Tag categories created by the import, in sort order (array index = sort_order).
// Each entry: `key` (category identifier used by the import code), `name`
// (display name) and `color` (hex colour string).
const TAG_CATEGORIES = [
  { key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
  { key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
  { key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
  { key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
  { key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
  { key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
  { key: 'payment_ability', name: '付费能力标签', color: '#f472b6' },
  { key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
  { key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
  { key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
  { key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
  { key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
  { key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
  { key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
  { key: 'service_duration', name: '服务周期标签', color: '#7c3aed' },
  // 16th category: the import tags column 18 (用户身份标签) under this key, but it
  // was never declared, so its category id resolved to undefined. Appended last
  // so the sort_order of the existing categories is unchanged.
  { key: 'user_identity', name: '用户身份标签', color: '#0ea5e9' }
];
|
||||
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
// 列数据映射(清洗2.0.xlsx)
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Column layout of 清洗2.0.xlsx. Field names are listed in sheet order and each
// one is mapped to its 1-based column number.
const COLUMN_MAPPING = Object.fromEntries(
  [
    // Raw survey data (columns 1-16)
    'family_role',          // family role
    'education',            // education level
    'profession',           // profession
    'age',                  // age
    'family_role_2',        // family role (second guardian)
    'child_gender',         // child's gender
    'child_grade',          // child's grade
    'academic_score',       // academic performance
    'family_situation',     // basic family situation
    'parent_child_rel',     // parent-child relationship
    'education_divergence', // whether the parents disagree on education
    'negate_child',         // whether the child is often negated
    'physical_punishment',  // whether scolding/beating is used
    'child_with_parents',   // whether the child grew up with the parents
    'caregivers',           // who else takes part in raising the child
    'child_situation',      // description of the child's current situation

    // Pre-generated values (columns 17-31)
    'service_days',         // number of days (a numeric value, not a tag)
    'user_identity',        // user identity tag
    'user_age',             // user age-group tag
    'child_grade_tag',      // child school-stage tag
    'family_struct_tag',    // family structure tag
    'education_risk',       // education risk tag
    'family_support',       // family support tag
    'payment_ability',      // payment ability tag
    'urgency',              // urgency tag
    'core_problem',         // core problem tag
    'intervention_diff',    // intervention difficulty tag
    'conversion_priority',  // conversion priority tag
    'channel_adaption',     // channel adaption tag
    'product_match',        // product match tag
    'service_duration'      // service duration tag
  ].map((field, position) => [field, position + 1])
);
|
||||
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
// 主程序
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
 * Rebuilds the 'onion' database from 清洗2.0.xlsx.
 *
 * Steps: initialise the database, wipe users/tags/categories, recreate the
 * categories from TAG_CATEGORIES, insert one user per data row that has a
 * family role, attach the role/education tags plus the pre-generated tag
 * columns, refresh tag coverage statistics and print a summary.
 *
 * On any error the failure is logged, the database handle is closed and the
 * process exits with status 1.
 */
async function main() {
  console.log('\n');
  console.log('╔════════════════════════════════════════════════════════════════╗');
  console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║');
  console.log('╚════════════════════════════════════════════════════════════════╝');
  console.log('');

  let db;
  try {
    console.log('🔧 初始化数据库...');
    initializeDatabase();
    db = getDb('onion');

    // Full replace: wipe all four tables before importing.
    console.log('🗑️ 清除旧数据...');
    db.prepare('DELETE FROM user_tags').run();
    db.prepare('DELETE FROM users').run();
    db.prepare('DELETE FROM tags').run();
    db.prepare('DELETE FROM tag_categories').run();

    console.log('📂 创建标签分类...');
    const insertCategoryStmt = db.prepare(`
      INSERT INTO tag_categories (key, name, color, sort_order)
      VALUES (?, ?, ?, ?)
    `);

    // category key -> row id of the freshly inserted category
    const categoryMap = {};
    TAG_CATEGORIES.forEach((cat, idx) => {
      const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
      categoryMap[cat.key] = result.lastInsertRowid;
    });

    console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`);

    console.log('📖 读取Excel文件...');
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.worksheets[0];

    console.log(` • 工作表: ${worksheet.name}`);
    console.log(` • 行数: ${worksheet.rowCount}`);
    console.log(` • 列数: ${worksheet.columnCount}\n`);

    // Statements are prepared once and reused for every row.
    const insertUserStmt = db.prepare(`
      INSERT INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
      VALUES (?, ?, ?, 0, 0, 0)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const selectTagStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);

    // "<category key>:<tag name>" -> tag id
    const tagCache = {};
    // Category keys already reported as missing, so each is warned about once.
    const unknownCategories = new Set();

    // Returns the id of the tag `tagName` inside category `catKey`, creating it
    // on first use. Returns null for blank names and for categories that were
    // not declared in TAG_CATEGORIES (binding an undefined category id would
    // otherwise fail or create an orphan tag).
    const getOrCreateTag = (catKey, tagName) => {
      if (!tagName || String(tagName).trim() === '') return null;

      const categoryId = categoryMap[catKey];
      if (categoryId === undefined) {
        if (!unknownCategories.has(catKey)) {
          unknownCategories.add(catKey);
          console.warn(`⚠️ 未定义的标签分类 "${catKey}",已跳过该分类的标签`);
        }
        return null;
      }

      const normalizedName = String(tagName).trim();
      const cacheKey = `${catKey}:${normalizedName}`;
      if (tagCache[cacheKey]) return tagCache[cacheKey];

      let tag = selectTagStmt.get(categoryId, normalizedName);
      if (!tag) {
        const result = insertTagStmt.run(
          `${catKey}_${Math.random().toString(36).slice(2)}`,
          normalizedName,
          categoryId
        );
        tag = { id: result.lastInsertRowid };
      }

      tagCache[cacheKey] = tag.id;
      return tag.id;
    };

    // [category key, column number] for the pre-generated tag columns (18-31).
    const tagColumns = [
      ['user_identity', COLUMN_MAPPING.user_identity],
      ['user_age_group', COLUMN_MAPPING.user_age],
      ['child_grade', COLUMN_MAPPING.child_grade_tag],
      ['family_structure', COLUMN_MAPPING.family_struct_tag],
      ['education_risk', COLUMN_MAPPING.education_risk],
      ['family_support', COLUMN_MAPPING.family_support],
      ['payment_ability', COLUMN_MAPPING.payment_ability],
      ['urgency', COLUMN_MAPPING.urgency],
      ['core_problem', COLUMN_MAPPING.core_problem],
      ['intervention_difficulty', COLUMN_MAPPING.intervention_diff],
      ['conversion_priority', COLUMN_MAPPING.conversion_priority],
      ['channel_adaption', COLUMN_MAPPING.channel_adaption],
      ['product_match', COLUMN_MAPPING.product_match],
      ['service_duration', COLUMN_MAPPING.service_duration]
    ];

    console.log('📝 导入用户数据...\n');
    let insertedCount = 0;
    let rowCount = 0;

    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // header row

      rowCount++;
      const values = row.values;

      // Rows without a family role are skipped; only early ones are reported.
      if (!values[COLUMN_MAPPING.family_role]) {
        if (rowCount <= 5) {
          console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`);
        }
        return;
      }

      // The uid is derived from the data-row counter, not the sheet row number.
      const uid = `user_${rowCount}`;
      const extraData = {
        row: rowNumber,
        days: values[COLUMN_MAPPING.service_days] || 0
      };

      const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData));
      if (result.changes <= 0) return;

      insertedCount++;
      const userId = result.lastInsertRowid;

      // Links the user to the tag unless the value is blank or the category is unknown.
      const addTag = (catKey, tagValue) => {
        const tagId = getOrCreateTag(catKey, tagValue);
        if (tagId) insertUserTagStmt.run(userId, tagId);
      };

      // Basic-info tags
      addTag('basic_info_role', values[COLUMN_MAPPING.family_role]);
      addTag('basic_info_education', values[COLUMN_MAPPING.education]);

      // Pre-generated tag columns
      tagColumns.forEach(([catKey, colIdx]) => addTag(catKey, values[colIdx]));

      if (rowCount % 100 === 0) {
        console.log(` ✓ 已处理 ${rowCount} 行...`);
      }
    });

    console.log(`\n✅ 用户导入完成:${insertedCount} 条\n`);

    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories,
        (SELECT COUNT(*) FROM user_tags) as total_relationships
    `).get();

    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    console.log(` • 用户-标签关系: ${stats.total_relationships}`);

    console.log('\n分类覆盖统计:');
    // COUNT(DISTINCT t.id): the join with user_tags repeats each tag once per
    // tagged user, so a plain COUNT(t.id) would report user-tag links, not tags.
    const catStats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.id
    `).all();

    catStats.forEach(stat => {
      const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0;
      console.log(` • ${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`);
    });

    db.close();

    console.log('\n🎉 导入流程完成!\n');

  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    // Best-effort close; the handle may not exist or may already be closed.
    try { if (db) db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Recomputes `coverage` (number of distinct users carrying the tag) and
 * `coverage_rate` (that number as a percentage of all users, 2 decimals)
 * for every row in `tags`.
 *
 * Runs as a single UPDATE instead of one statement per tag. When the users
 * table is empty the rate is stored as 0 rather than the NULL that a division
 * by zero would produce.
 *
 * @param db Database handle exposing `prepare(sql).run()`.
 */
function updateTagStats(db) {
  db.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = COALESCE(
        ROUND(
          (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 /
          NULLIF((SELECT COUNT(*) FROM users), 0),
          2
        ),
        0
      )
  `).run();
}
|
||||
|
||||
// 执行主程序
|
||||
main();
|
||||
448
scripts/import-clean-data-v3.js
Normal file
448
scripts/import-clean-data-v3.js
Normal file
@@ -0,0 +1,448 @@
|
||||
/**
|
||||
* 清洗3.0 导入脚本 v1.0
|
||||
*
|
||||
* 业务约束:
|
||||
* 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)”
|
||||
* 2) 监护人2相关字段不参与建模
|
||||
* 3) 删除付费能力标签分类
|
||||
* 4) 全量替换导入
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb, initializeDatabase } = require('../db/init');
|
||||
|
||||
const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx');
|
||||
const DB_THEME = 'onion';
|
||||
const TOTAL_USERS_FALLBACK = 11500;
|
||||
|
||||
// Tag categories created by the import, listed as [key, name, color] and
// expanded into { key, name, color } objects. Array index doubles as sort order.
const TAG_CATEGORIES = [
  ['basic_info_role', '家庭角色', '#d97706'],
  ['user_age_group', '用户年龄段标签', '#6366f1'],
  ['child_grade', '孩子学段标签', '#8b5cf6'],
  ['family_structure', '家庭结构标签', '#a78bfa'],
  ['education_risk', '教育风险标签', '#c084fc'],
  ['family_support', '家庭支持度标签', '#ec4899'],
  ['urgency', '需求紧迫度标签', '#f97316'],
  ['core_problem', '核心问题标签', '#06b6d4'],
  ['intervention_difficulty', '干预难度标签', '#0891b2'],
  ['conversion_priority', '转化优先级标签', '#10b981'],
  ['channel_adaption', '渠道适配标签', '#059669'],
  ['product_match', '产品匹配标签', '#f59e0b'],
  ['basic_info_education', '文化程度', '#dc2626'],
  ['service_duration', '服务周期标签', '#7c3aed']
].map(([key, name, color]) => ({ key, name, color }));
|
||||
|
||||
/**
 * Converts any cell value to a clean string: null/undefined become '',
 * everything else is stringified, whitespace runs are collapsed to a single
 * space and the ends are trimmed.
 */
function text(v) {
  if (v == null) {
    return '';
  }
  const collapsed = String(v).replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Extracts a finite number from a cell value.
 *
 * Every character other than digits, '.' and '-' is stripped before parsing.
 * Returns null for null/undefined/'' input, when nothing numeric remains, or
 * when the remaining characters do not form a finite number (e.g. '30-40').
 */
function parseNumber(v) {
  const isBlank = v == null || v === '';
  if (isBlank) return null;

  const numericChars = String(v).replace(/[^\d.\-]/g, '');
  if (numericChars.length === 0) return null;

  const parsed = Number(numericChars);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
|
||||
|
||||
/**
 * Splits a multi-value cell into trimmed, non-empty parts. Separators are any
 * run of: 、 , , ; ; / |. Returns [] when the cleaned cell is empty.
 */
function splitMulti(v) {
  const cleaned = text(v);
  if (cleaned === '') return [];

  const parts = [];
  for (const piece of cleaned.split(/[、,,;;/|]+/)) {
    const trimmed = piece.trim();
    if (trimmed) parts.push(trimmed);
  }
  return parts;
}
|
||||
|
||||
/**
 * Reduces a free-text family-atmosphere description to one of three labels:
 * '冷漠', '温暖' or '中立'.
 *
 * Keyword groups are checked in order, so cold/conflict words take precedence
 * over warm ones. Empty or unrecognised text yields '中立'.
 */
function normalizeFamilyAtmosphere(v) {
  const description = text(v);
  if (!description) return '中立';

  const groups = [
    // Cold and conflict keywords both map to the same label.
    {
      label: '冷漠',
      words: [
        '冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂',
        '争吵', '冲突', '矛盾', '紧张', '对立', '不和'
      ]
    },
    { label: '温暖', words: ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好'] },
    { label: '中立', words: ['一般', '普通', '还行', '尚可', '平常'] }
  ];

  const matched = groups.find(({ words }) => words.some((word) => description.includes(word)));
  return matched ? matched.label : '中立';
}
|
||||
|
||||
/**
 * Reduces a parent-child relationship description to '紧张', '良好' or '中立'.
 * Negative keywords are tested before positive ones; empty or unrecognised
 * text yields '中立'.
 */
function normalizeParentChild(v) {
  const description = text(v);
  // An empty string matches neither pattern and falls through to '中立'.
  const isTense = /(紧张|疏离|冲突|差|糟)/.test(description);
  if (isTense) return '紧张';

  const isGood = /(良好|亲密|和谐|较好|很好)/.test(description);
  return isGood ? '良好' : '中立';
}
|
||||
|
||||
/**
 * Maps family-role synonyms onto a canonical label. The first matching alias
 * wins; text that matches no alias is returned cleaned but otherwise
 * unchanged, and empty input yields ''.
 */
function normalizeRole(v) {
  const raw = text(v);

  const aliases = [
    [/(妈妈|母亲|妈咪)/, '妈妈'],
    [/(爸爸|父亲)/, '父亲'],
    [/(奶奶|祖母)/, '奶奶'],
    [/(爷爷|祖父)/, '爷爷'],
    [/(姥姥|外婆)/, '姥姥/外婆']
  ];

  for (const [pattern, canonical] of aliases) {
    if (pattern.test(raw)) return canonical;
  }
  return raw;
}
|
||||
|
||||
/**
 * Buckets a numeric age into an age-group label. null/undefined yields '';
 * any value that is not below 55 (including NaN) lands in the last bucket.
 */
function ageToTag(age) {
  if (age == null) return '';

  // [exclusive upper bound, label], checked in ascending order.
  const buckets = [
    [25, '25岁以下'],
    [35, '25-34岁'],
    [45, '35-44岁'],
    [55, '45-54岁']
  ];
  for (const [upperBound, label] of buckets) {
    if (age < upperBound) return label;
  }
  return '55岁及以上';
}
|
||||
|
||||
/**
 * Maps a grade description onto a school stage: '幼儿园', '小学', '初中',
 * '高中' or '大学'. Unrecognised text is returned cleaned but otherwise
 * unchanged; empty input yields ''.
 *
 * The junior-high, senior-high and university patterns are tested BEFORE the
 * primary-school one. The primary pattern contains '一年级'…'六年级', which are
 * substrings of values such as '初一年级' or '高三年级'; testing it first
 * classified those grades as '小学'.
 */
function normalizeGrade(v) {
  const s = text(v);
  if (!s) return '';
  if (/幼/.test(s)) return '幼儿园';
  if (/(初一|初二|初三|初中)/.test(s)) return '初中';
  if (/(高一|高二|高三|高中)/.test(s)) return '高中';
  if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学';
  if (/(小|一年级|二年级|三年级|四年级|五年级|六年级)/.test(s)) return '小学';
  return s;
}
|
||||
|
||||
/**
 * Reduces an academic-performance description to '优秀', '良好', '较差' or
 * '一般'. Tiers are tested in that order; empty or unrecognised text
 * yields '一般'.
 */
function normalizeScore(v) {
  const description = text(v);

  const tiers = [
    [/(优秀|优异|很好|拔尖)/, '优秀'],
    [/(良好|较好|不错)/, '良好'],
    [/(差|不理想|偏下|落后|薄弱)/, '较差']
  ];

  // An empty description matches no tier and falls back to '一般'.
  const tier = tiers.find(([pattern]) => pattern.test(description));
  return tier ? tier[1] : '一般';
}
|
||||
|
||||
/**
 * Infers a core-problem label from the other answers of a row when the
 * respondent did not state one. Every returned label carries the '(推断)'
 * suffix. Checks run in priority order: poor grades, negating or punishing
 * parenting, strained family communication, major life events, then a
 * generic fallback.
 *
 * @param row Object keyed by sheet header name.
 */
function inferCoreProblem(row) {
  const grades = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
  if (grades === '较差') return '学习动力与执行(推断)';

  // NOTE(review): plain substring tests, so an answer like '没有' also matches '有'; confirm against the data.
  const affirmative = /(有|是|存在|经常)/;
  const negates = affirmative.test(text(row['是否经常否定孩子']));
  const punishes = affirmative.test(text(row['有无打骂教育']));
  if (negates || punishes) return '教养方式调整(推断)';

  const coldHome = normalizeFamilyAtmosphere(row['家庭氛围']) === '冷漠';
  const tenseRelation = normalizeParentChild(row['亲子关系']) === '紧张';
  const parentsDisagree = /(有|是|分歧)/.test(text(row['家长有无教育分歧']));
  if (coldHome || tenseRelation || parentsDisagree) return '亲子沟通修复(推断)';

  const majorEvent = text(row['重大影响事件_扩展']);
  if (/(离异|变故|创伤|重大)/.test(majorEvent)) return '情绪与安全感支持(推断)';

  return '阶段性成长支持(推断)';
}
|
||||
|
||||
/**
 * Collects education-risk labels for a row. Each rule pairs a sheet column
 * with a keyword pattern; a label is emitted when the cleaned cell matches.
 * Labels come back in rule order, and the array is empty when nothing matches.
 *
 * @param row Object keyed by sheet header name.
 */
function inferEducationRisk(row) {
  // NOTE(review): patterns are substring tests, so '没有' would match '有'; confirm against the data.
  const rules = [
    ['家长有无教育分歧', /(有|是|分歧|不一致)/, '教育理念分歧'],
    ['是否经常否定孩子', /(有|是|经常|总是)/, '否定式沟通风险'],
    ['有无打骂教育', /(有|是|打|骂|体罚)/, '惩罚式教育风险'],
    ['孩子是否在父母身边长大', /(否|不在|老人|寄养|留守)/, '陪伴不足风险']
  ];

  return rules
    .filter(([column, pattern]) => pattern.test(text(row[column])))
    .map(([, , label]) => label);
}
|
||||
|
||||
/**
 * Derives family-structure labels from the household description, the
 * "raised with parents" answer and the list of other caregivers. Several
 * labels may apply; when none does, the single label '常规家庭结构' is returned.
 *
 * @param row Object keyed by sheet header name.
 */
function inferFamilyStructure(row) {
  const household = text(row['家庭基本情况_规范'] || row['家庭基本情况']);
  const raisedWithParents = text(row['孩子是否在父母身边长大']);
  const otherCaregivers = text(row['还有谁参与孩子的养育']);

  const labels = [];

  if (/单亲|离异/.test(household)) {
    labels.push('单亲家庭');
  }
  if (/重组/.test(household)) {
    labels.push('重组家庭');
  }
  // Grandparent involvement can be signalled by either field.
  const grandparentsInvolved =
    /三代同堂|隔代|祖/.test(household) || /爷爷|奶奶|姥姥|外婆|祖/.test(otherCaregivers);
  if (grandparentsInvolved) {
    labels.push('隔代参与家庭');
  }
  if (/(否|不在|寄养|留守)/.test(raisedWithParents)) {
    labels.push('分离养育家庭');
  }

  return labels.length > 0 ? labels : ['常规家庭结构'];
}
|
||||
|
||||
/**
 * Rates how urgent a case is. Poor grades, a tense parent-child relationship
 * or any sign of physical punishment gives '高紧迫度'; otherwise average grades
 * give '中紧迫度' and everything else '低紧迫度'.
 *
 * @param row Object keyed by sheet header name.
 */
function inferUrgency(row) {
  const grades = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
  const tenseRelation = normalizeParentChild(row['亲子关系']) === '紧张';
  // NOTE(review): substring test, so '没有' also matches '有'; confirm against the data.
  const punished = /(有|是|打|骂)/.test(text(row['有无打骂教育']));

  if (grades === '较差' || tenseRelation || punished) return '高紧迫度';
  return grades === '一般' ? '中紧迫度' : '低紧迫度';
}
|
||||
|
||||
/**
 * Rates intervention difficulty from a weighted sum of four signals:
 * tense relationship (2), parental disagreement (1), frequent negation (1)
 * and physical punishment (2). A total of 4 or more is high, 2-3 is medium,
 * and anything below is low.
 *
 * @param row Object keyed by sheet header name.
 */
function inferInterventionDifficulty(row) {
  // [signal present?, weight]
  const signals = [
    [normalizeParentChild(row['亲子关系']) === '紧张', 2],
    [/(有|是|分歧)/.test(text(row['家长有无教育分歧'])), 1],
    [/(有|是|经常)/.test(text(row['是否经常否定孩子'])), 1],
    [/(有|是|打|骂)/.test(text(row['有无打骂教育'])), 2]
  ];

  const total = signals.reduce((sum, [present, weight]) => (present ? sum + weight : sum), 0);

  if (total >= 4) return '高干预难度';
  return total >= 2 ? '中干预难度' : '低干预难度';
}
|
||||
|
||||
/**
 * Combines urgency and intervention difficulty into a conversion priority:
 * urgent but not highly difficult cases are high priority; other urgent cases
 * and medium-difficulty cases are medium; the rest are low.
 *
 * @param row Object keyed by sheet header name.
 */
function inferConversionPriority(row) {
  const urgent = inferUrgency(row) === '高紧迫度';
  const difficulty = inferInterventionDifficulty(row);

  if (urgent && difficulty !== '高干预难度') {
    return '高优先级';
  }
  if (urgent || difficulty === '中干预难度') {
    return '中优先级';
  }
  return '低优先级';
}
|
||||
|
||||
/**
 * Picks a communication-channel label from the questionnaire assessment text.
 * Online keywords are checked before offline ones; empty or unrecognised text
 * yields '标准沟通'.
 *
 * @param row Object keyed by sheet header name.
 */
function inferChannelAdaption(row) {
  const assessment = text(row['问卷评估']);
  // An empty assessment matches neither pattern and falls through to the default.
  const prefersOnline = /(线上|微信|视频)/.test(assessment);
  if (prefersOnline) return '线上沟通优先';

  const prefersOffline = /(线下|到访|面谈)/.test(assessment);
  return prefersOffline ? '线下面谈优先' : '标准沟通';
}
|
||||
|
||||
/**
 * Chooses a product plan from two signals, poor grades and a tense
 * parent-child relationship: both -> comprehensive plan, grades only ->
 * learning plan, relationship only -> communication plan, neither -> growth
 * support plan.
 *
 * @param row Object keyed by sheet header name.
 */
function inferProductMatch(row) {
  const poorGrades = normalizeScore(row['学习成绩_规范'] || row['学习成绩']) === '较差';
  const tenseRelation = normalizeParentChild(row['亲子关系']) === '紧张';

  if (poorGrades) {
    return tenseRelation ? '综合干预方案' : '学习提升方案';
  }
  return tenseRelation ? '亲子沟通方案' : '成长支持方案';
}
|
||||
|
||||
/**
 * Suggests a service length: 12 weeks for urgent or highly difficult cases,
 * 8 weeks for medium urgency, 4 weeks otherwise.
 *
 * @param row Object keyed by sheet header name.
 */
function inferServiceDuration(row) {
  const urgencyLevel = inferUrgency(row);
  const highlyDifficult = inferInterventionDifficulty(row) === '高干预难度';

  if (urgencyLevel === '高紧迫度' || highlyDifficult) {
    return '12周';
  }
  return urgencyLevel === '中紧迫度' ? '8周' : '4周';
}
|
||||
|
||||
/**
 * Recomputes `coverage` (distinct users carrying the tag) and `coverage_rate`
 * (percentage of all users, 2 decimals) for every row in `tags`.
 *
 * The denominator is the current user count, or TOTAL_USERS_FALLBACK when the
 * users table is empty. All tags are refreshed by one UPDATE statement rather
 * than one statement execution per tag.
 *
 * @param db Database handle exposing `prepare(sql)` with `get()` / `run()`.
 */
function updateTagStats(db) {
  const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n || TOTAL_USERS_FALLBACK;

  db.prepare(`
    UPDATE tags SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
|
||||
|
||||
/**
 * Full-replace import of 清洗3.0.xlsx into the DB_THEME database.
 *
 * Steps: wipe users/tags/categories, recreate the categories from
 * TAG_CATEGORIES, verify that all required headers exist, then for every data
 * row with a family role insert a user and attach normalised and inferred
 * tags. When the respondent gave no "most wanted to solve" answer, the core
 * problem is inferred and the user's extra_json records `inferredCore: true`.
 *
 * On any error the failure is logged, the database is closed and the process
 * exits with status 1.
 */
async function main() {
  console.log('\n🚀 清洗3.0 导入流程 v1.0\n');

  initializeDatabase(DB_THEME);
  const db = getDb(DB_THEME);

  try {
    db.pragma('foreign_keys = OFF');

    console.log('🗑️ 清空旧数据...');
    db.prepare('DELETE FROM user_tags').run();
    db.prepare('DELETE FROM users').run();
    db.prepare('DELETE FROM tags').run();
    db.prepare('DELETE FROM tag_categories').run();

    // category key -> row id of the freshly inserted category
    const categoryMap = {};
    const insertCategoryStmt = db.prepare(`
      INSERT INTO tag_categories (key, name, color, sort_order)
      VALUES (?, ?, ?, ?)
    `);

    TAG_CATEGORIES.forEach((cat, idx) => {
      const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
      categoryMap[cat.key] = result.lastInsertRowid;
    });

    console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类(已删除付费能力)`);

    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.worksheets[0];

    console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`);

    // header text -> 1-based column number
    const headerRow = worksheet.getRow(1);
    const headers = {};
    headerRow.eachCell((cell, colNumber) => {
      headers[text(cell.value)] = colNumber;
    });

    const needHeaders = [
      '家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范',
      '家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子',
      '有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育',
      '重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称'
    ];

    // Fail fast before inserting any user if the sheet layout is not the expected one.
    const missing = needHeaders.filter((h) => !headers[h]);
    if (missing.length) {
      throw new Error(`缺少关键表头: ${missing.join(', ')}`);
    }

    // Statements are prepared once and reused for every row.
    const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)');
    const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)');
    const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
    const selectTagStmt = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?');

    // "<category key>:<tag name>" -> tag id
    const tagCache = new Map();

    // Returns the id of tag `tagName` in category `catKey`, creating the tag on
    // first use; returns null for blank names.
    function getOrCreateTag(catKey, tagName) {
      const n = text(tagName);
      if (!n) return null;
      const cacheKey = `${catKey}:${n}`;
      if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);

      let tag = selectTagStmt.get(categoryMap[catKey], n);
      if (!tag) {
        const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`;
        const result = insertTagStmt.run(key, n, categoryMap[catKey]);
        tag = { id: result.lastInsertRowid };
      }
      tagCache.set(cacheKey, tag.id);
      return tag.id;
    }

    let rowCount = 0;
    let inserted = 0;
    let inferredCoreCount = 0;

    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // header row
      rowCount += 1;

      // Re-key the row by header name so the helpers can read by column title.
      const rowObj = {};
      for (const [name, idx] of Object.entries(headers)) {
        rowObj[name] = row.getCell(idx).value;
      }

      const role = normalizeRole(rowObj['家庭角色']);
      if (!role) return;

      const fileName = text(rowObj['文件名称']);
      const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60);
      const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`;

      // Decide up front whether the core problem must be inferred, so the flag
      // stored in extra_json is accurate (it used to be written as false always).
      const originCore = splitMulti(rowObj['参加指导最想解决_扩展']);
      const coreIsInferred = originCore.length === 0;

      const userExtra = {
        rowNumber,
        inferredCore: coreIsInferred,
        source: 'clean3.0'
      };

      const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra));
      if (!result.changes) return;

      inserted += 1;
      const userId = result.lastInsertRowid;

      const addTag = (catKey, tagName) => {
        const tagId = getOrCreateTag(catKey, tagName);
        if (tagId) insertUserTagStmt.run(userId, tagId);
      };

      // Basic tags
      addTag('basic_info_role', role);
      addTag('basic_info_education', text(rowObj['文化程度']));

      // Age groups: both guardians' numeric ages are tagged; no other
      // guardian-2 field is used.
      const age1 = parseNumber(rowObj['年龄_数值']);
      const age2 = parseNumber(rowObj['年龄_2_数值']);
      addTag('user_age_group', ageToTag(age1));
      addTag('user_age_group', ageToTag(age2));

      // School stage
      addTag('child_grade', normalizeGrade(rowObj['年级_规范']));

      // Family structure
      inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t));

      // Education risk
      inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t));

      // Family support (atmosphere + parent-child relationship)
      addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`);
      addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`);

      // Urgency, difficulty, priority
      addTag('urgency', inferUrgency(rowObj));
      addTag('intervention_difficulty', inferInterventionDifficulty(rowObj));
      addTag('conversion_priority', inferConversionPriority(rowObj));

      // Channel / product / duration
      addTag('channel_adaption', inferChannelAdaption(rowObj));
      addTag('product_match', inferProductMatch(rowObj));
      addTag('service_duration', inferServiceDuration(rowObj));

      // Core problem: the respondent's own answers win; otherwise infer one.
      if (coreIsInferred) {
        addTag('core_problem', inferCoreProblem(rowObj));
        inferredCoreCount += 1;
      } else {
        originCore.forEach((tag) => addTag('core_problem', tag));
      }

      if (rowCount % 500 === 0) {
        console.log(` ✓ 已处理 ${rowCount} 行`);
      }
    });

    console.log(`\n✅ 导入用户: ${inserted}`);
    console.log(`✅ 核心问题推断数: ${inferredCoreCount}`);

    updateTagStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories,
        (SELECT COUNT(*) FROM user_tags) as total_rels
    `).get();

    console.log('\n📊 结果统计');
    console.log(` • 用户数: ${stats.total_users}`);
    console.log(` • 标签数: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    console.log(` • 关系数: ${stats.total_rels}`);

    // Sanity check: the payment-ability category must no longer exist.
    const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n;
    console.log(` • 付费能力分类存在数: ${deletedPayment}`);

    const inferredTags = db.prepare(`
      SELECT COUNT(*) as n FROM tags t
      JOIN tag_categories c ON c.id = t.category_id
      WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)'
    `).get().n;
    console.log(` • 推断核心问题标签种类: ${inferredTags}`);

    db.pragma('foreign_keys = ON');
    db.close();

    console.log('\n🎉 清洗3.0导入完成\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    // Best-effort close; the handle may already be closed.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
|
||||
|
||||
main();
|
||||
673
scripts/import-clean-data.js
Normal file
673
scripts/import-clean-data.js
Normal file
@@ -0,0 +1,673 @@
|
||||
/**
|
||||
* 新数据导入脚本 v3.0
|
||||
* 基于"清洗1.0.xlsx"的完整标签体系
|
||||
*
|
||||
* 标签体系:49个标签,分为5个维度
|
||||
* 用法: node scripts/import-clean-data.js
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb, initializeDatabase } = require('../db/init');
|
||||
|
||||
const EXCEL_FILE = path.join(__dirname, '../清洗1.0.xlsx');
|
||||
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
// 标签分类定义 v3.0 - 49个标签 5个维度
|
||||
// ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
// Tag category table: one entry per category, grouped by dimension.
// `columns` holds the 1-based worksheet column the category is read from
// (A = 1); `type` names the kind of value in that column. addUserTags in this
// file reads the same column indexes directly rather than through this table.
const TAG_CATEGORIES = [
  // Dimension 1: guardian information (19 tags)
  { key: 'guardian_role', name: '监护人身份', color: '#3b82f6', columns: [1], type: 'discrete' }, // A: family role
  { key: 'guardian_education', name: '文化程度', color: '#6366f1', columns: [2], type: 'discrete' }, // B: education level
  { key: 'guardian_occupation', name: '职业与经济地位', color: '#8b5cf6', columns: [3], type: 'discrete' }, // C: occupation
  { key: 'guardian_age_group', name: '监护人年龄段', color: '#a78bfa', columns: [4], type: 'continuous' }, // D: age
  { key: 'second_guardian_role', name: '第二监护人身份', color: '#c084fc', columns: [5], type: 'discrete' }, // E: family role of the second guardian

  // Dimension 2: child information (13 tags)
  { key: 'child_gender', name: '孩子性别', color: '#ec4899', columns: [6], type: 'discrete' }, // F: gender
  { key: 'child_grade', name: '孩子学段', color: '#f472b6', columns: [7], type: 'discrete' }, // G: grade
  { key: 'child_academic_score', name: '学习成绩', color: '#f97316', columns: [8], type: 'discrete' }, // H: academic performance

  // Dimension 3: family environment (8 tags)
  {
    key: 'family_structure',
    name: '家庭结构',
    color: '#06b6d4',
    columns: [9], // I: family background
    type: 'keyword_extract',
    keywords: ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家']
  },
  { key: 'parent_child_relationship', name: '亲子关系', color: '#0891b2', columns: [10], type: 'text' }, // J: parent-child relationship
  { key: 'child_living_with_parents', name: '与父母同住情况', color: '#10b981', columns: [14], type: 'yes_no' }, // N: did the child grow up with the parents
  { key: 'child_caregivers', name: '参与养育人员', color: '#059669', columns: [15], type: 'text' }, // O: who else helps raise the child

  // Dimension 4: education risk (6 tags)
  { key: 'education_consensus', name: '教育理念一致性', color: '#f59e0b', columns: [11], type: 'yes_no' }, // K: do the parents disagree on education
  { key: 'child_negation', name: '否定孩子情况', color: '#d97706', columns: [12], type: 'yes_no' }, // L: is the child frequently negated
  { key: 'physical_punishment', name: '打骂教育', color: '#dc2626', columns: [13], type: 'yes_no' }, // M: scolding / physical punishment

  // Dimension 5: service plan (3 tags)
  { key: 'service_duration', name: '服务周期', color: '#7c3aed', columns: [17], type: 'discrete' } // Q: number of days
];
|
||||
|
||||
// Value normalization rules: for each category key, maps a raw (trimmed) cell
// value to the canonical tag name. Values without an entry are handled by the
// caller (addUserTags falls back to the raw value or to keyword checks).
const TAG_MAPPINGS = {
  guardian_role: {
    '母亲': '母亲', '妈妈': '母亲', '母': '母亲',
    '父亲': '父亲', '爸爸': '父亲',
    '奶奶': '祖母', '祖母': '祖母',
    '爷爷': '祖父',
    '外婆': '外祖母', '外公': '外祖父',
    '姥姥': '外祖母', '姥爷': '外祖父',
    '舅舅': '其他亲属', '妻子': '其他亲属', '大姐': '其他亲属'
  },
  guardian_education: {
    '初小': '小学', '小学': '小学',
    '初中': '初中',
    '中师': '中专', '中专': '中专',
    '高中': '高中',
    '大专': '大专',
    '大学': '本科', '本科': '本科', '大学本科': '本科',
    '硕士': '硕士及以上', '研究生': '硕士及以上', '在职研究生': '硕士及以上'
  },
  child_gender: {
    '男': '男孩', '女': '女孩', '女、男': '双胞胎'
  },
  child_academic_score: {
    '优秀': '优秀', '良好': '良好', '一般': '一般', '差': '较差'
  },
  child_living_with_parents: {
    '是': '是', '是的': '是', '在': '是',
    '否': '否', '没有': '否', '不是': '否'
  },
  education_consensus: {
    '有': '有分歧', '是': '有分歧',
    '否': '无分歧', '无': '无分歧', '没有': '无分歧'
  },
  child_negation: {
    '是': '是', '有': '是', '是的': '是', '经常': '是',
    '否': '否', '无': '否', '没有': '否', '偶尔': '否'
  },
  physical_punishment: {
    '有': '有', '是': '有', '有过': '有', '偶尔有': '有',
    '无': '无', '没有': '无', '否': '无', '基本上没有': '无'
  },
  service_duration: {
    '60天': '60天课程', '90天': '90天课程', '180天': '180天课程'
  }
};
|
||||
|
||||
/**
 * Map a guardian's age to the label of its age bracket.
 *
 * The value is parsed before it is validated, so a number, a numeric string,
 * or a string with a trailing unit such as "35岁" are all accepted. Missing,
 * unparseable, zero or negative ages yield '年龄未知'.
 *
 * @param {number|string|null|undefined} age - raw age value from the worksheet.
 * @returns {string} bracket label; each bracket includes its lower bound and
 *   excludes its upper bound (e.g. 35 falls in '35-45岁').
 */
function getAgeGroup(age) {
  if (age === null || age === undefined) return '年龄未知';

  // parseInt with an explicit radix reads the leading digits and ignores a
  // trailing unit; checking the parsed number (not the raw input) is what lets
  // "35岁" through.
  const ageNum = parseInt(String(age).trim(), 10);
  if (!Number.isFinite(ageNum) || ageNum <= 0) return '年龄未知';

  // [exclusive upper bound, label], in ascending order.
  const brackets = [
    [25, '25岁以下'],
    [35, '25-35岁'],
    [45, '35-45岁'],
    [55, '45-55岁'],
    [65, '55-65岁'],
    [75, '65-75岁']
  ];
  const match = brackets.find(([upper]) => ageNum < upper);
  return match ? match[1] : '75岁以上';
}
|
||||
|
||||
/**
 * Map a raw grade description to a school-stage label.
 *
 * @param {*} grade - raw grade cell value (e.g. '三年级', '初一', '高三').
 * @returns {string} stage label, or '学段未知' when the value is empty or
 *   matches none of the known markers.
 */
function gradeToSegment(grade) {
  if (!grade) return '学段未知';
  const gradeStr = String(grade).toLowerCase();
  const hasAny = (...markers) => markers.some((marker) => gradeStr.includes(marker));

  // Secondary-school markers are tested first: "初一", "高三" etc. contain the
  // numerals 一/二/三, so testing the primary-school numerals first would
  // classify every middle/high-school grade as lower primary.
  if (hasAny('高三')) return '高中毕业班(高三)';
  if (hasAny('高一', '高二')) return '高中前期(高一高二)';
  if (hasAny('初三', '九年')) return '初中毕业班(初三)';
  // 七年/八年 (grades 7-8) are the numeric spellings of 初一/初二, by analogy
  // with 九年 above.
  if (hasAny('初一', '初二', '准初', '七年', '八年')) return '初中前期(初一初二)';

  if (hasAny('一', '1年', '二', '2年', '三', '3年')) return '小学低段(1-3年级)';
  if (hasAny('四', '4年', '五', '5年', '六', '6年')) return '小学高段(4-6年级)';

  return '学段未知';
}
|
||||
|
||||
/**
 * Classify a free-text description of the parent-child relationship.
 *
 * @param {*} text - raw cell value describing the relationship.
 * @returns {string} '亲子关系良好', '亲子关系一般', '亲子关系较差',
 *   '亲子关系未评估' when no keyword matches, or '未指定' for an empty value.
 */
function relationshipQuality(text) {
  if (!text) return '未指定';
  const lowerText = String(text).toLowerCase();
  const mentions = (words) => words.some((word) => lowerText.includes(word));

  // These phrases contain "好", so they have to be classified before the
  // generic "好" test below; otherwise "不好" and "时好时坏" are reported as a
  // good relationship.
  if (mentions(['不好', '不太好'])) return '亲子关系较差';
  if (mentions(['时好时坏'])) return '亲子关系一般';

  if (mentions(['良好', '好', '和谐', '可以', '还好', '较好', '还可以'])) {
    return '亲子关系良好';
  }
  if (mentions(['一般', '还行', '正常'])) {
    return '亲子关系一般';
  }
  if (mentions(['差', '紧张'])) {
    return '亲子关系较差';
  }

  return '亲子关系未评估';
}
|
||||
|
||||
/**
 * Import the data rows of EXCEL_FILE into the 'onion' database.
 *
 * Steps: read worksheet 1, make sure every TAG_CATEGORIES entry exists in
 * tag_categories, insert one users row per data row (uid `user_<n>`), attach
 * tags through addUserTags, refresh tag coverage with updateTagStats, and
 * print summary counts. Rows whose first column is empty are skipped. Users
 * are inserted with INSERT OR IGNORE, and tags are attached only to rows that
 * were actually inserted.
 *
 * On any failure the error is logged, the database handle is closed if it had
 * been opened, and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function importCleanData() {
  // Declared outside the try block so the failure path can still close it.
  let db = null;

  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);

    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);

    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }

    console.log(`📊 总行数: ${worksheet.rowCount}`);

    db = getDb('onion');

    // Initialize the database
    initializeDatabase('onion');

    // Create every tag category and remember its row id by key.
    console.log('🏗️ 建立分类体系...');
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const selectCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    const categoryMap = {};
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      categoryMap[cat.key] = selectCategoryStmt.get(cat.key).id;
    }

    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    // Statements used once per row / per tag are prepared a single time here
    // instead of inside the loops.
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const selectTagByNameStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const tagKeyExistsStmt = db.prepare(`SELECT 1 FROM tags WHERE key = ?`);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    const selectTagByKeyStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);

    // "<category key>:<tag name>" -> tag id, filled as tags are resolved.
    const tagCache = {};

    // Return the id of the tag named tagName in category catKey, creating the
    // tag when it does not exist yet. Returns null when either argument is empty.
    function getOrCreateTag(catKey, tagName) {
      if (!tagName || !catKey) return null;

      const cacheKey = `${catKey}:${tagName}`;
      if (tagCache[cacheKey]) return tagCache[cacheKey];

      // Reuse an existing tag with the same name in this category, if any.
      let tag = selectTagByNameStmt.get(categoryMap[catKey], tagName);

      if (!tag) {
        // Build a key from a 24-bit string hash of the normalized name.
        const tagNameNorm = String(tagName).toLowerCase().trim().replace(/\s+/g, '_');
        const hashCode = Array.from(tagNameNorm).reduce((h, c) => ((h << 5) - h) + c.charCodeAt(0), 0) & 0xffffff;
        let tagKey = `${catKey}_${hashCode.toString(16)}`;

        // On a key collision append _1, _2, ... until the key is unused.
        let counter = 1;
        while (tagKeyExistsStmt.get(tagKey)) {
          tagKey = `${catKey}_${hashCode.toString(16)}_${counter}`;
          counter++;
        }

        insertTagStmt.run(tagKey, tagName, categoryMap[catKey], 0);
        tag = selectTagByKeyStmt.get(tagKey);
      }

      tagCache[cacheKey] = tag?.id;
      return tag?.id;
    }

    // Walk the worksheet rows.
    let insertedCount = 0;
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row

      rowCount++;
      const values = row.values || [];

      // Basic fields; the uid is derived from the row position.
      const uid = `user_${rowNumber - 1}`;
      const guardianRole = values[1];
      const childGrade = values[7];
      const childDesc = values[16];

      if (!guardianRole) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少监护人身份,跳过`);
        return;
      }

      // Extra per-user data stored as JSON; the description is capped at 500 characters.
      const extraData = {
        row: rowNumber,
        guardianRole: guardianRole,
        childGrade: childGrade,
        childDescription: childDesc ? String(childDesc).substring(0, 500) : ''
      };

      const result = insertUserStmt.run(uid, String(guardianRole), JSON.stringify(extraData));

      // changes === 0 means the uid already existed and the insert was ignored.
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;

        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap);

        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });

    console.log(`\n✅ 用户导入完成:${insertedCount} 条`);

    // Refresh the coverage statistics of every tag.
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();

    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);

    db.close();

    console.log('\n🎉 导入流程完成!\n');

  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    // Best-effort close before exiting; a failure to close is deliberately
    // ignored because the original error has already been reported.
    if (db) {
      try { db.close(); } catch (_) {}
    }
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Derive tags from one worksheet row and attach them to a user.
 *
 * @param {*} userId - id of the users row the tags are linked to.
 * @param {Array} values - cell values of the row, indexed by 1-based column number.
 * @param {number} rowNumber - worksheet row number; rows up to 5 print debug
 *   output for the first three categories.
 * @param {Function} getOrCreateTag - (categoryKey, tagName) => tag id, or
 *   null/undefined when no tag could be resolved.
 * @param {Object} insertUserTagStmt - statement whose run(userId, tagId) links
 *   a tag to a user.
 * @param {Object} categoryMap - not used in this function; kept so existing
 *   callers keep working.
 */
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap) {
  const debug = rowNumber <= 5;

  // Resolve a tag and link it to the user; returns the tag id for logging.
  const attach = (catKey, tagName) => {
    const tagId = getOrCreateTag(catKey, tagName);
    if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId);
    return tagId;
  };

  // Remove negated phrases so that the "有"/"是" inside "没有"/"不是" is not
  // read as an affirmative answer by the keyword fallbacks below.
  const stripNegations = (answer) => answer.replace(/没有|不是/g, '');

  // Guardian role
  if (values[1]) {
    const role = String(values[1]).trim();
    const mapped = TAG_MAPPINGS.guardian_role[role] || role;
    const tagId = attach('guardian_role', mapped);
    if (debug) console.log(` [行${rowNumber}] 监护人身份: "${role}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Education level
  if (values[2]) {
    const edu = String(values[2]).trim();
    const mapped = TAG_MAPPINGS.guardian_education[edu] || edu;
    const tagId = attach('guardian_education', mapped);
    if (debug) console.log(` [行${rowNumber}] 文化程度: "${edu}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Occupation, bucketed by keyword; the first rule with a matching keyword wins.
  if (values[3]) {
    const job = String(values[3]).trim().toLowerCase();
    const jobRules = [
      ['专业人士', ['教师', '医生', '工程', '律师']],
      ['工人', ['工人', '工厂']],
      ['农民', ['农']],
      ['公司/政府工作人员', ['员工', '职员', '公务', '干部']],
      ['退休人士', ['退休']],
      ['个体户/自由职业', ['个体', '自由', '经营']],
      ['销售/商业', ['商业', '销售']],
      ['家务', ['家']]
    ];
    const rule = jobRules.find(([, words]) => words.some((word) => job.includes(word)));
    const jobCategory = rule ? rule[0] : '其他';

    const tagId = attach('guardian_occupation', jobCategory);
    if (debug) console.log(` [行${rowNumber}] 职业: "${job}" -> "${jobCategory}" (tagId: ${tagId})`);
  }

  // Guardian age bracket
  if (values[4]) {
    attach('guardian_age_group', getAgeGroup(values[4]));
  }

  // Second guardian role; '无' and '/' mean "none".
  if (values[5]) {
    const role2 = String(values[5]).trim();
    if (role2 && role2 !== '无' && role2 !== '/') {
      attach('second_guardian_role', TAG_MAPPINGS.guardian_role[role2] || role2);
    }
  }

  // Child gender
  if (values[6]) {
    const gender = String(values[6]).trim();
    attach('child_gender', TAG_MAPPINGS.child_gender[gender] || gender);
  }

  // Child school stage
  if (values[7]) {
    attach('child_grade', gradeToSegment(values[7]));
  }

  // Academic performance; a cell may list several values separated by "、" or
  // a comma (ASCII or full-width), and each becomes its own tag.
  if (values[8]) {
    const scoreStr = String(values[8]).trim();
    const scores = scoreStr.split(/[、,,]/).map((s) => s.trim()).filter((s) => s && !s.includes('null'));
    for (const score of scores) {
      attach('child_academic_score', TAG_MAPPINGS.child_academic_score[score] || score);
    }
  }

  // Family structure: one tag per keyword found in the free text.
  if (values[9]) {
    const familyStr = String(values[9]).trim();
    const keywords = ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代'];
    const found = keywords.filter((kw) => familyStr.includes(kw));
    found.forEach((kw) => attach('family_structure', kw));
    // No keyword recognized: fall back to the raw text, capped at 50 characters.
    if (found.length === 0) {
      attach('family_structure', familyStr.substring(0, 50));
    }
  }

  // Parent-child relationship
  if (values[10]) {
    attach('parent_child_relationship', relationshipQuality(values[10]));
  }

  // Agreement between parents on education
  if (values[11]) {
    const consensus = String(values[11]).trim();
    const mapped = TAG_MAPPINGS.education_consensus[consensus] ||
      (stripNegations(consensus).includes('有') ? '有分歧' : '无分歧');
    attach('education_consensus', mapped);
  }

  // Whether the child is frequently negated
  if (values[12]) {
    const negation = String(values[12]).trim();
    const rest = stripNegations(negation);
    const mapped = TAG_MAPPINGS.child_negation[negation] ||
      (rest.includes('是') || rest.includes('有') ? '是' : '否');
    attach('child_negation', mapped);
  }

  // Scolding / physical punishment
  if (values[13]) {
    const punishment = String(values[13]).trim();
    const mapped = TAG_MAPPINGS.physical_punishment[punishment] ||
      (stripNegations(punishment).includes('有') ? '有' : '无');
    attach('physical_punishment', mapped);
  }

  // Whether the child lives with the parents: exact mapping first, then keywords.
  if (values[14]) {
    const living = String(values[14]).trim();
    let mapped = TAG_MAPPINGS.child_living_with_parents[living];
    if (!mapped) {
      if (living.includes('是') && !living.includes('不是')) {
        mapped = '是';
      } else if (living.includes('否') || living.includes('不是')) {
        mapped = '否';
      } else {
        mapped = '是'; // default when nothing matches
      }
    }
    attach('child_living_with_parents', mapped);
  }

  // Other caregivers: pick the first rule whose keyword appears in the text.
  if (values[15]) {
    const caregiverStr = String(values[15]).trim();
    if (caregiverStr && caregiverStr !== '无' && caregiverStr !== '没有') {
      const caregiverRules = [
        ['母亲', ['妈妈', '母亲']],
        ['父亲', ['父亲', '爸爸']],
        ['祖父', ['爷爷']],
        ['祖母', ['奶奶']],
        ['外祖父', ['外公', '姥爷']],
        ['外祖母', ['外婆', '姥姥']],
        // '外祖' must be tested before '祖', otherwise maternal grandparents
        // written as "外祖…" are classified as paternal.
        ['外祖父母', ['外祖']],
        ['祖父母', ['祖']],
        ['外祖父母', ['外']]
      ];
      const rule = caregiverRules.find(([, words]) => words.some((word) => caregiverStr.includes(word)));
      attach('child_caregivers', rule ? rule[0] : '其他');
    }
  }

  // Service duration
  if (values[17]) {
    const duration = String(values[17]).trim();
    attach('service_duration', TAG_MAPPINGS.service_duration[duration] || duration);
  }
}
|
||||
|
||||
/**
 * Recompute coverage and coverage_rate for every row of the tags table.
 *
 * coverage is the number of user_tags rows that reference the tag;
 * coverage_rate is that count as a percentage of all users, formatted with
 * two decimals (or 0 when there are no users).
 *
 * @param {Object} db - database handle exposing prepare(sql) whose statements
 *   provide all(), get() and run().
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;

  // Prepared once and reused for every tag instead of once per iteration.
  const countStmt = db.prepare(`
    SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?
  `);
  const updateStmt = db.prepare(`
    UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
  `);

  for (const tag of tags) {
    const coverage = countStmt.get(tag.id).n || 0;
    const coverageRate = totalUsers > 0 ? (coverage / totalUsers * 100).toFixed(2) : 0;
    updateStmt.run(coverage, coverageRate, tag.id);
  }
}
|
||||
|
||||
importCleanData();
|
||||
414
scripts/import-excel.js
Normal file
414
scripts/import-excel.js
Normal file
@@ -0,0 +1,414 @@
|
||||
/**
|
||||
* Excel 数据导入脚本 v2
|
||||
* 将"家庭教育档案-天数.xlsx"中的完整数据导入到数据库
|
||||
* 支持多维度标签分类
|
||||
*
|
||||
* 用法: node scripts/import-excel.js [path/to/file.xlsx]
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb, initializeDatabase } = require('../db/init');
|
||||
|
||||
const EXCEL_FILE = process.argv[2] || path.join(__dirname, '../家庭教育档案-天数.xlsx');
|
||||
|
||||
// ────────────────────────────────────
|
||||
// 标签分类定义
|
||||
// ────────────────────────────────────
|
||||
// Tag category table. `column` is the 1-based worksheet column the category's
// value is read from (A = 1). Entries without `color` get the default color
// applied where the categories are inserted.
const TAG_CATEGORIES = [
  // 1. Guardian information
  { key: 'guardian_role', name: '监护人身份', color: '#3b82f6', column: 3 }, // C: family role
  { key: 'guardian_education', name: '监护人文化程度', color: '#8b5cf6', column: 4 }, // D: education level
  { key: 'guardian1_personality', name: '监护人1性格特征', color: '#a78bfa', column: 7 }, // G: personality traits
  { key: 'guardian2_personality', name: '监护人2性格特征', color: '#c084fc', column: 14 }, // N: personality traits of guardian 2

  // 2. Child information
  { key: 'child_gender', name: '孩子性别', color: '#ec4899', column: 17 }, // Q: gender
  { key: 'child_personality', name: '孩子性格特征', color: '#f472b6', column: 20 }, // T: child's personality traits
  { key: 'child_score', name: '孩子学习成绩', color: '#f59e0b', column: 21 }, // U: academic performance

  // 3. Family situation
  { key: 'family_structure', name: '家庭基本情况', color: '#06b6d4', column: 23 }, // W: family background (contains phrases such as "三代同堂")
  { key: 'family_atmosphere', name: '家庭氛围', color: '#10b981', column: 24 }, // X: family atmosphere
  { key: 'parent_child_relation', name: '亲子关系', color: '#6366f1', column: 25 }, // Y: parent-child relationship

  // 4. Parenting behavior
  { key: 'education_conflict', name: '教育理念一致性', column: 26 }, // Z: do the parents disagree on education
  { key: 'child_negation', name: '否定现象', column: 27 }, // AA: is the child frequently negated
  { key: 'physical_punishment', name: '纪律方式', column: 28 }, // AB: scolding / physical punishment
  { key: 'child_with_parents', name: '亲子陪伴', column: 29 }, // AC: did the child grow up with the parents

  // 5. Guidance period
  { key: 'duration', name: '指导周期', color: '#ef4444', column: 38 } // AL: number of days
];
|
||||
|
||||
// Value mapping: for each category key, maps a raw (trimmed) worksheet value
// to the tag name stored for it. Values without an entry are used as-is.
const TAG_VALUE_MAP = {
  guardian_role: {
    '母亲': '母亲', '妈妈': '母亲', '母': '母亲',
    '父亲': '父亲', '爸爸': '父亲',
    '奶奶': '奶奶', '爷爷': '爷爷',
    '外婆': '外婆', '外公': '外公',
    '姥姥': '外婆', '姥爷': '外公',
    '祖母': '奶奶',
    '大姐': '成年子女',
    '舅舅': '其他亲属',
    '妻子': '配偶'
  },
  guardian_education: {
    '初中': '初中',
    '初小': '小学', '小学': '小学',
    '中师': '中专', '中专': '中专',
    '高中': '高中',
    '大专': '大专',
    '大学': '本科', '本科': '本科', '大学本科': '本科',
    '硕士': '硕士', '研究生': '硕士', '在职研究生': '硕士'
  },
  child_gender: {
    '女': '女孩', '男': '男孩', '女、男': '双胞胎'
  },
  child_score: {
    '优秀': '优秀', '良好': '良好', '一般': '一般',
    '差': '较差', '较差': '较差',
    'A': '优秀', 'B': '良好', 'C': '一般', 'D': '较差'
  },
  duration: {
    '60天': '60天课程', '180天': '180天课程', '90天': '90天课程', '365天': '365天课程'
  }
};
|
||||
|
||||
// Categories whose free-text value is additionally scanned for keywords; each
// keyword found in the value becomes an extra tag in the same category.
const KEYWORD_EXTRACTION_FIELDS = {
  'family_structure': {
    // Aligned with the family_structure entry of TAG_CATEGORIES (column 23,
    // "W"). addUserTags reads the value through TAG_CATEGORIES and never
    // consults this field, so the previous value (22) had no effect at runtime.
    column: 23,
    keywords: ['三代同堂', '四口之家', '三口之家', '单亲', '离异', '隔代抚养', '二代', '三代']
  }
};
|
||||
|
||||
/**
 * Import the data rows of EXCEL_FILE into the 'onion' database.
 *
 * Steps: read worksheet 1, make sure every TAG_CATEGORIES entry exists in
 * tag_categories, insert one users row per data row (uid = value of column 1),
 * attach tags through addUserTags, refresh tag coverage with updateTagStats,
 * and print summary counts. Rows whose first column is empty are skipped.
 * Users are inserted with INSERT OR IGNORE, and tags are attached only to rows
 * that were actually inserted.
 *
 * On any failure the error is logged, the database handle is closed if it had
 * been opened, and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function importExcelData() {
  const crypto = require('crypto');

  // Declared outside the try block so the failure path can still close it.
  let db = null;

  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);

    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);

    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }

    console.log(`📊 总行数: ${worksheet.rowCount}`);

    db = getDb('onion');

    // Initialize the database
    initializeDatabase('onion');

    // Create every tag category and remember its row id by key.
    console.log('🏗️ 建立分类体系...');
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const selectCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    const categoryMap = {};
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      categoryMap[cat.key] = selectCategoryStmt.get(cat.key).id;
    }

    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    // Statements used once per row / per tag are prepared a single time here.
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const selectTagByKeyStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);

    // "<category key>:<tag name>" -> tag id, filled as tags are resolved.
    const tagCache = {};

    const shortHash = (text) => crypto.createHash('md5').update(text).digest('hex').substring(0, 8);

    // Return the id of the tag for tagName in category catKey, creating the
    // tag when its key does not exist yet. Returns null when either argument
    // is empty.
    function getOrCreateTag(catKey, tagName) {
      if (!tagName || !catKey) return null;

      const cacheKey = `${catKey}:${tagName}`;
      if (tagCache[cacheKey]) return tagCache[cacheKey];

      let tagKey;
      const isPersonality = catKey.includes('personality');

      if (isPersonality && tagName.length > 30) {
        // Long personality descriptions: first 20 characters (word characters
        // only) plus an 8-hex-digit MD5 prefix of the full text.
        const simplified = tagName.substring(0, 20).toLowerCase().replace(/\s+/g, '_').replace(/[^\w]/g, '');
        tagKey = `${catKey}_${simplified}_${shortHash(tagName)}`;
      } else {
        // Keep letters and digits of any script. The previous /[^\w]/ filter
        // removed every CJK character, so all Chinese tag names in a category
        // produced the same key and were stored as one tag. ASCII names yield
        // the same key as before.
        // NOTE(review): a database filled by the earlier version may still
        // contain the collapsed "<category>_" keys.
        let slug = tagName.toLowerCase().replace(/\s+/g, '_').replace(/[^\p{L}\p{N}_]/gu, '');
        // Names made only of punctuation would otherwise all share one key.
        if (!slug) slug = shortHash(tagName);
        tagKey = `${catKey}_${slug}`;
      }

      let tag = selectTagByKeyStmt.get(tagKey);

      if (!tag) {
        // Create the tag, then read back its id.
        insertTagStmt.run(tagKey, tagName, categoryMap[catKey], 0);
        tag = selectTagByKeyStmt.get(tagKey);
      }

      tagCache[cacheKey] = tag?.id;
      return tag?.id;
    }

    // Walk the worksheet rows.
    let insertedCount = 0;
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row

      rowCount++;
      const values = row.values || [];

      // Basic fields
      const fileName = values[1]; // file name, used as the user's uid
      const childName = values[16]; // child's name

      if (!fileName) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少文件名,跳过`);
        return;
      }

      // Extra per-user data stored as JSON.
      // NOTE(review): childAge reads column 17 and familyAddress column 23,
      // the same columns TAG_CATEGORIES assigns to child_gender and
      // family_structure — confirm these indexes against the worksheet.
      const extraData = {
        fileName: fileName,
        childName: childName || '',
        guardian1Name: values[2],
        childAge: values[17],
        grade: values[19],
        learningScore: values[21],
        familyAddress: values[23],
        questionnaireSummary: values[37],
      };

      const result = insertUserStmt.run(fileName, childName || fileName, JSON.stringify(extraData));

      // changes === 0 means the uid already existed and the insert was ignored.
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;

        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt);

        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });

    console.log(`\n✅ 用户导入完成:${insertedCount} 条`);

    // Refresh the coverage statistics of every tag.
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();

    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);

    db.close();

    console.log('\n🎉 导入流程完成!\n');

  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    // Best-effort close before exiting; a failure to close is deliberately
    // ignored because the original error has already been reported.
    if (db) {
      try { db.close(); } catch (_) {}
    }
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Attach tags to a user for every category in TAG_CATEGORIES.
 *
 * For each category the cell at the category's column is trimmed, translated
 * through TAG_VALUE_MAP when an entry exists, and linked as a tag. A
 * child_score cell containing "、" is split into one tag per part. Categories
 * listed in KEYWORD_EXTRACTION_FIELDS also get one tag per keyword contained
 * in the value.
 *
 * @param {*} userId - id of the users row the tags are linked to.
 * @param {Array} values - cell values of the row, indexed by column number.
 * @param {number} rowNumber - worksheet row number (not used in this function).
 * @param {Function} getOrCreateTag - (categoryKey, tagName) => tag id or a falsy value.
 * @param {Object} insertUserTagStmt - statement whose run(userId, tagId) links
 *   a tag to a user.
 */
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt) {
  // Resolve a tag name within a category and link it when an id comes back.
  const link = (categoryKey, tagName) => {
    const id = getOrCreateTag(categoryKey, tagName);
    if (id) {
      insertUserTagStmt.run(userId, id);
    }
  };

  TAG_CATEGORIES.forEach(({ key, column }) => {
    if (column >= values.length) return;

    const raw = values[column];
    if (!raw) return;

    const text = String(raw).trim();
    const valueMap = TAG_VALUE_MAP[key];

    // A mixed score such as "优秀、良好" becomes one tag per part.
    if (key === 'child_score' && text.includes('、')) {
      text.split('、').forEach((part) => {
        const score = part.trim();
        link(key, valueMap?.[score] || score);
      });
      return;
    }

    // Use the mapped name when the map has a non-empty entry, else the raw text.
    const tagName = valueMap && valueMap[text] ? valueMap[text] : text;
    link(key, tagName);

    // Extra tags for each configured keyword contained in the value.
    const extraction = KEYWORD_EXTRACTION_FIELDS[key];
    if (extraction) {
      extraction.keywords
        .filter((keyword) => tagName.includes(keyword))
        .forEach((keyword) => link(key, keyword));
    }
  });
}
|
||||
|
||||
/**
 * Recompute `coverage` and `coverage_rate` for every row of the `tags` table.
 *
 * `coverage` is the number of `user_tags` rows pointing at the tag and
 * `coverage_rate` is that count as a percentage of all users, rounded to two
 * decimals. The rate is always written as a number (0 when there are no users).
 *
 * @param {object} db - database handle exposing `prepare(sql)` whose statements
 *   offer `all()`, `get(...)` and `run(...)`.
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;

  // Prepare once; both statements are reused for every tag.
  const countStmt = db.prepare(`
    SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?
  `);
  const updateStmt = db.prepare(`
    UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
  `);

  for (const tag of tags) {
    const coverage = countStmt.get(tag.id).n || 0;
    // toFixed() yields a string; convert back so the column always receives a number.
    const coverageRate = totalUsers > 0
      ? Number((coverage / totalUsers * 100).toFixed(2))
      : 0;

    updateStmt.run(coverage, coverageRate, tag.id);
  }
}
|
||||
|
||||
// Script entry point: start the import as soon as the file is executed.
importExcelData();
|
||||
192
scripts/import-tags-from-v1.js
Normal file
192
scripts/import-tags-from-v1.js
Normal file
@@ -0,0 +1,192 @@
|
||||
/**
|
||||
* 从清洗1.0.xlsx 中导入标签数据到现有的清洗2.0 用户
|
||||
*
|
||||
* 策略:
|
||||
* 1. 读取清洗1.0.xlsx 的标签列(18-31)
|
||||
* 2. 尝试通过前7列数据匹配清洗2.0中的用户
|
||||
* 3. 导入匹配到的标签
|
||||
*/
|
||||
|
||||
const ExcelJS = require('exceljs');
|
||||
const path = require('path');
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
/**
 * Copy tag assignments from 清洗1.0.xlsx onto users imported from 清洗2.0.xlsx.
 *
 * Rows of the two workbooks are paired through a key built from worksheet
 * columns 1-7. For each paired row, the values in columns 18-31 of the 1.0
 * sheet become tags (created on demand inside the matching category) and are
 * linked to the user whose `uid` is `user_<row number in the 2.0 sheet>`.
 *
 * Progress and a per-category summary are logged to stdout. On any error the
 * message and stack are logged, the database is closed and the process exits
 * with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let db = null;

  try {
    console.log('\n╔════════════════════════════════════════════════════════════════╗');
    console.log('║ 📥 从清洗1.0 导入标签数据 ║');
    console.log('╚════════════════════════════════════════════════════════════════╝\n');

    // Worksheet column -> tag category.
    const TAG_COLUMN_MAP = {
      18: { catKey: 'user_identity', catName: '用户身份标签' },
      19: { catKey: 'user_age_group', catName: '用户年龄段标签' },
      20: { catKey: 'child_grade', catName: '孩子学段标签' },
      21: { catKey: 'family_structure', catName: '家庭结构标签' },
      22: { catKey: 'education_risk', catName: '教育风险标签' },
      23: { catKey: 'family_support', catName: '家庭支持度标签' },
      24: { catKey: 'payment_ability', catName: '付费能力标签' },
      25: { catKey: 'urgency', catName: '需求紧迫度标签' },
      26: { catKey: 'core_problem', catName: '核心问题标签' },
      27: { catKey: 'intervention_difficulty', catName: '干预难度标签' },
      28: { catKey: 'conversion_priority', catName: '转化优先级标签' },
      29: { catKey: 'service_duration', catName: '服务周期标签' },
      30: { catKey: 'channel_adaption', catName: '渠道适配标签' },
      31: { catKey: 'product_match', catName: '产品匹配标签' }
    };

    // Columns whose trimmed values, joined with "|", identify a person in both sheets.
    const KEY_COLUMNS = [1, 2, 3, 4, 5, 6, 7];
    const buildRowKey = (row) =>
      KEY_COLUMNS.map((c) => {
        const v = row.values[c];
        return v ? String(v).trim() : '';
      }).join('|');

    db = getDb('onion');

    console.log('📖 读取清洗1.0.xlsx...');
    const wb1 = new ExcelJS.Workbook();
    await wb1.xlsx.readFile(path.join(__dirname, '../清洗1.0.xlsx'));
    const ws1 = wb1.worksheets[0];

    console.log('📖 读取清洗2.0.xlsx...');
    const wb2 = new ExcelJS.Workbook();
    await wb2.xlsx.readFile(path.join(__dirname, '../清洗2.0.xlsx'));
    const ws2 = wb2.worksheets[0];

    // Index the 1.0 sheet: row key -> { categoryKey: [tag values] }.
    // A later row with the same key replaces an earlier one.
    const tagsByKey = new Map();

    ws1.eachRow((row, rowNum) => {
      if (rowNum === 1) return; // skip header

      const tags = {};
      for (const [col, info] of Object.entries(TAG_COLUMN_MAP)) {
        const tagValue = row.values[parseInt(col, 10)];
        if (tagValue && String(tagValue).trim() !== '') {
          if (!tags[info.catKey]) tags[info.catKey] = [];
          tags[info.catKey].push(String(tagValue).trim());
        }
      }
      tagsByKey.set(buildRowKey(row), tags);
    });

    console.log(` • 清洗1.0 索引: ${tagsByKey.size} 行\n`);

    let matched = 0;
    let tagInserted = 0;
    const tagCache = new Map(); // "<categoryId>:<tagName>" -> tag id

    const insertTagStmt = db.prepare(`
      INSERT OR IGNORE INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
      VALUES (?, ?, ?, 0, 0, 0)
    `);

    const getTagIdStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);

    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);

    // Prepared once instead of once per worksheet row.
    const getUserStmt = db.prepare('SELECT id FROM users WHERE uid = ?');

    // Category key -> category id.
    const catIdMap = {};
    const categories = db.prepare('SELECT id, key FROM tag_categories').all();
    for (const cat of categories) {
      catIdMap[cat.key] = cat.id;
    }

    // Look up a tag id by (category, name), creating the tag when missing.
    // Returns undefined when the tag can neither be found nor created.
    const resolveTagId = (catKey, catId, tagValue) => {
      const cacheKey = `${catId}:${tagValue}`;
      if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);

      let found = getTagIdStmt.get(catId, tagValue);
      if (!found) {
        insertTagStmt.run(
          `${catKey}_${Math.random().toString(36).slice(2)}`,
          tagValue,
          catId
        );
        // INSERT OR IGNORE may have been ignored, so the row is not guaranteed to exist.
        found = getTagIdStmt.get(catId, tagValue);
      }

      const tagId = found ? found.id : undefined;
      if (tagId) tagCache.set(cacheKey, tagId);
      return tagId;
    };

    console.log('🔗 匹配清洗2.0用户...\n');

    ws2.eachRow((row, rowNum) => {
      if (rowNum === 1) return; // skip header

      const tags = tagsByKey.get(buildRowKey(row));
      if (!tags) return;

      // Users are addressed by the uid `user_<row number>`.
      const user = getUserStmt.get(`user_${rowNum}`);
      if (!user) return;

      for (const [catKey, tagValues] of Object.entries(tags)) {
        const catId = catIdMap[catKey];
        if (!catId) continue;

        for (const tagValue of tagValues) {
          const tagId = resolveTagId(catKey, catId, tagValue);
          if (!tagId) continue;

          // Count only links that were actually written, not ignored duplicates.
          const linkResult = insertUserTagStmt.run(user.id, tagId);
          if (linkResult.changes > 0) {
            tagInserted++;
          }
        }
      }

      matched++;
      if (matched % 500 === 0) {
        console.log(` ✓ 已匹配 ${matched} 行...`);
      }
    });

    console.log(`\n✅ 标签导入完成:`);
    console.log(` • 匹配用户: ${matched}`);
    console.log(` • 导入标签链接: ${tagInserted}`);

    console.log('\n📊 标签分布:');
    const tagStats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.id
      LIMIT 16
    `).all();

    for (const stat of tagStats) {
      console.log(` • ${stat.name}: ${stat.tag_count} tags, ${stat.user_count || 0} users`);
    }

    const openDb = db;
    db = null; // mark as closed so the error path does not close it twice
    openDb.close();
  } catch (e) {
    console.error('❌ Error:', e.message);
    console.error(e.stack);
    if (db) {
      db.close();
    }
    process.exit(1);
  }
}
|
||||
|
||||
// Script entry point: run the tag import when the file is executed.
main();
|
||||
168
scripts/merge-tags-v2.js
Normal file
168
scripts/merge-tags-v2.js
Normal file
@@ -0,0 +1,168 @@
|
||||
const { getDb } = require('../db/init');
|
||||
// Module-level database handle, shared by mergeTags() below.
const db = getDb('onion');

// Family-role merge table: category name -> { master tag: [synonym tags folded into it] }.
// NOTE(review): 姥爷 is folded into 姥姥 and 外公 into 外婆 (grandfather into
// grandmother) — confirm that collapsing these is intended.
// NOTE(review): the master here is '父亲' with '爸爸' as a synonym, whereas
// scripts/merge-tags.js uses '爸爸' as master with '父亲' as a synonym — confirm
// which one should survive.
const MERGE_MAPPING = {
  '家庭角色': {
    '妈妈': ['母亲', '母親', '孩子母亲', '孩子妈妈', '全职妈妈', '妈咪', '蚂妈', '妈妈一', '妈妈初', '妈妈大专', '母', '女主人', '母亲初初', '母亲中中中', '家庭主妇', '照孩子'],
    '父亲': ['爸爸', '父', '爸', '养父'],
    '奶奶': ['祖母'],
    '姥姥': ['姥爷'],
    '爷爷': ['祖父'],
    '外婆': ['外公'],
  }
};

// Erroneous tag names to delete (meaningless, or belonging to another category).
const INVALID_TAGS = ['初中', '文 化', '*'];
|
||||
|
||||
/**
 * Fold synonym tags into their master tag and delete invalid tags.
 *
 * For every category in MERGE_MAPPING, each synonym's user links are copied
 * to the master tag (duplicates ignored), then the synonym's links and the
 * synonym tag itself are deleted, and the master's `coverage` /
 * `coverage_rate` are recomputed against the current number of users.
 * Afterwards every name in INVALID_TAGS is deleted from the categories that
 * were processed, so a same-named tag in an unrelated category is left alone.
 *
 * Logs a summary, closes the module-level `db` and exits the process
 * (status 0 on success, 1 on error).
 */
function mergeTags() {
  try {
    console.log('🔄 开始合并同类标签...\n');

    let totalMerged = 0;
    let totalDeleted = 0;

    // Statements are prepared once and reused inside the loops.
    const findCategoryStmt = db.prepare('SELECT id FROM tag_categories WHERE name = ?');
    const findTagStmt = db.prepare('SELECT id FROM tags WHERE name = ? AND category_id = ?');
    const countUsersStmt = db.prepare(
      'SELECT COUNT(DISTINCT user_id) as count FROM user_tags WHERE tag_id = ?'
    );
    const moveLinksStmt = db.prepare(
      `INSERT OR IGNORE INTO user_tags (user_id, tag_id)
       SELECT user_id, ? FROM user_tags WHERE tag_id = ?`
    );
    const deleteLinksStmt = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTagStmt = db.prepare('DELETE FROM tags WHERE id = ?');
    const updateCoverageStmt = db.prepare(
      'UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?'
    );

    // Use the live user count rather than a figure copied from an earlier run.
    const totalUsers = db.prepare('SELECT COUNT(*) as count FROM users').get().count;

    // Ids of the categories actually processed; invalid tags are removed only from these.
    const processedCategoryIds = [];

    for (const [categoryName, tagMappings] of Object.entries(MERGE_MAPPING)) {
      console.log(`\n📁 分类: ${categoryName}`);

      const categoryResult = findCategoryStmt.get(categoryName);
      if (!categoryResult) {
        console.log(`❌ 无法找到分类: ${categoryName}`);
        continue;
      }

      const categoryId = categoryResult.id;
      processedCategoryIds.push(categoryId);

      for (const [masterTagName, synonyms] of Object.entries(tagMappings)) {
        console.log(`\n 主标签: ${masterTagName}`);

        const masterTag = findTagStmt.get(masterTagName, categoryId);
        if (!masterTag) {
          console.log(` ❌ 主标签 "${masterTagName}" 不存在`);
          continue;
        }

        const masterTagId = masterTag.id;

        for (const synonym of synonyms) {
          const synonymTag = findTagStmt.get(synonym, categoryId);
          if (!synonymTag) {
            console.log(` ⚠️ 同义词 "${synonym}" 不存在,跳过`);
            continue;
          }

          const synonymTagId = synonymTag.id;
          // A synonym that resolves to the master itself must never be deleted.
          if (synonymTagId === masterTagId) continue;

          const userCount = countUsersStmt.get(synonymTagId)?.count || 0;

          // Copy links to the master first, then drop the synonym's links and the synonym.
          moveLinksStmt.run(masterTagId, synonymTagId);
          deleteLinksStmt.run(synonymTagId);
          deleteTagStmt.run(synonymTagId);

          console.log(` ✅ 合并 "${synonym}" (${userCount} 用户) → "${masterTagName}"`);
          totalMerged++;
        }

        // Refresh the master tag's coverage figures.
        const newCoverage = countUsersStmt.get(masterTagId)?.count || 0;
        const coverageRate = totalUsers > 0
          ? ((newCoverage / totalUsers) * 100).toFixed(2)
          : '0.00';

        updateCoverageStmt.run(newCoverage, parseFloat(coverageRate), masterTagId);

        console.log(` 📊 "${masterTagName}" 新覆盖: ${newCoverage} 用户 (${coverageRate}%)`);
      }
    }

    console.log(`\n\n🗑️ 删除无效标签...`);
    for (const invalidTagName of INVALID_TAGS) {
      for (const categoryId of processedCategoryIds) {
        const invalidTag = findTagStmt.get(invalidTagName, categoryId);
        if (!invalidTag) continue;

        deleteLinksStmt.run(invalidTag.id);
        deleteTagStmt.run(invalidTag.id);
        console.log(` ✅ 删除无效标签: "${invalidTagName}"`);
        totalDeleted++;
      }
    }

    console.log(`\n\n✨ 合并完成!`);
    console.log(`📊 统计:`);
    console.log(` • 合并的同义词: ${totalMerged} 个`);
    console.log(` • 删除的无效标签: ${totalDeleted} 个`);

    const tagCountResult = db.prepare('SELECT COUNT(*) as count FROM tags').get();
    const userTagCountResult = db.prepare('SELECT COUNT(*) as count FROM user_tags').get();

    console.log(` • 剩余标签总数: ${tagCountResult.count}`);
    console.log(` • 用户-标签关系总数: ${userTagCountResult.count}`);

    console.log(`\n📋 家庭角色分类的最新状态:`);
    const finalTags = db.prepare(
      `SELECT name, coverage, coverage_rate
       FROM tags
       WHERE category_id = (SELECT id FROM tag_categories WHERE name = '家庭角色')
       ORDER BY coverage DESC`
    ).all();

    finalTags.forEach((tag) => {
      console.log(` • ${tag.name}: ${tag.coverage} 用户 (${tag.coverage_rate}%)`);
    });

    console.log(`\n✨ 总计: ${finalTags.length} 个家庭角色标签`);
    console.log(`\n💡 提示: 请执行以下命令重启服务器以清除缓存:`);
    console.log(` pkill -f "node server.js" && sleep 2 && node server.js &\n`);

    db.close();
    process.exit(0);
  } catch (error) {
    console.error('❌ 错误:', error);
    db.close();
    process.exit(1);
  }
}
|
||||
|
||||
// Script entry point: run the merge when the file is executed.
mergeTags();
|
||||
144
scripts/merge-tags.js
Normal file
144
scripts/merge-tags.js
Normal file
@@ -0,0 +1,144 @@
|
||||
/**
|
||||
* 合并同义标签脚本
|
||||
* 定义同义词映射,将重复标签合并到主标签
|
||||
*/
|
||||
|
||||
const { getDb } = require('../db/init');
|
||||
|
||||
// Synonym table per category.
// Format: { categoryName: { master_tag: [synonym1, synonym2, ...] } }
const MERGE_MAPPING = {
  // Family role — keep the short, canonical form as master.
  '家庭角色': {
    '妈妈': ['母亲', '母親', '孩子母亲', '孩子妈妈', '全职妈妈', '妈咪', '蚂妈', '妈妈一', '妈妈初', '妈妈大专', '妈', '女主人'],
    '爸爸': ['父亲', '父', '爸'],
    '奶奶': ['祖母'],
    '爷爷': ['祖父'],
    // NOTE(review): '外公 alternate' looks like a placeholder that is not meant
    // to match any real tag name (外公 is the other gender) — confirm.
    '外婆': ['外公 alternate'], // 外公 is the other gender
    // NOTE(review): 姥爷 is folded into 姥姥 here although the line above treats
    // the equivalent pair as different genders — confirm this merge is intended.
    '姥姥': ['姥爷'],
  },
  // Other categories are not merged for now.
};
|
||||
|
||||
/**
 * Merge synonym tags into their master tag, category by category.
 *
 * For every (master, synonyms) entry of MERGE_MAPPING the synonym's user links
 * are copied to the master (duplicates ignored), the synonym's links and the
 * synonym tag are deleted, and the master's `coverage` / `coverage_rate` are
 * recomputed. A per-category summary is printed at the end.
 *
 * Opens its own database handle and closes it before returning; on error the
 * error is logged, the handle is closed and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function mergeTags() {
  const db = getDb('onion');

  try {
    console.log('\n' + '='.repeat(70));
    console.log('🔗 开始合并同义标签');
    console.log('='.repeat(70) + '\n');

    let totalMerged = 0;
    let totalDeleted = 0;

    // Prepared once and reused; none of them depends on the loop variables.
    const findCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE name = ?
    `);
    const findTagStmt = db.prepare(`
      SELECT id, coverage FROM tags
      WHERE category_id = ? AND name = ?
    `);
    const moveStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const deleteLinksStmt = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTagStmt = db.prepare('DELETE FROM tags WHERE id = ?');
    const countUsersStmt = db.prepare(`
      SELECT COUNT(DISTINCT user_id) as cnt FROM user_tags WHERE tag_id = ?
    `);
    const updateStatsStmt = db.prepare(`
      UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
    `);

    // The users table is not modified by this script, so count it once.
    const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n;

    for (const [categoryName, mapping] of Object.entries(MERGE_MAPPING)) {
      console.log(`\n📂 处理分类: ${categoryName}`);
      console.log('-'.repeat(70));

      const category = findCategoryStmt.get(categoryName);
      if (!category) {
        console.log(` ⚠️ 分类不存在`);
        continue;
      }

      const categoryId = category.id;

      for (const [masterName, synonyms] of Object.entries(mapping)) {
        const masterTag = findTagStmt.get(categoryId, masterName);
        if (!masterTag) {
          console.log(` ⚠️ 主标签不存在: ${masterName}`);
          continue;
        }

        console.log(`\n ✓ 主标签: ${masterName} (ID: ${masterTag.id}, 用户数: ${masterTag.coverage})`);

        for (const synonym of synonyms) {
          const synonymTag = findTagStmt.get(categoryId, synonym);
          if (!synonymTag) {
            console.log(` • ${synonym} (不存在,跳过)`);
            continue;
          }

          // A synonym that resolves to the master itself must never be deleted.
          if (synonymTag.id === masterTag.id) continue;

          console.log(` • 合并 ${synonym} (ID: ${synonymTag.id}, 用户数: ${synonymTag.coverage})`);

          // 1. Copy the synonym's user links to the master tag.
          moveStmt.run(masterTag.id, synonymTag.id);
          // 2. Remove the synonym's own user links.
          deleteLinksStmt.run(synonymTag.id);
          // 3. Remove the synonym tag.
          deleteTagStmt.run(synonymTag.id);

          totalMerged++;
          totalDeleted++;
        }

        // Refresh the master tag's statistics.
        const coverage = countUsersStmt.get(masterTag.id).cnt || 0;
        const coverage_rate = totalUsers > 0 ? +(coverage / totalUsers * 100).toFixed(2) : 0;

        updateStatsStmt.run(coverage, coverage_rate, masterTag.id);

        console.log(` ✅ 更新主标签统计: ${coverage} 用户 (${coverage_rate}%)`);
      }
    }

    console.log('\n' + '='.repeat(70));
    console.log(`✅ 合并完成`);
    console.log(` • 合并数量: ${totalMerged} 个同义标签`);
    console.log(` • 删除数量: ${totalDeleted} 个重复标签`);
    console.log('='.repeat(70) + '\n');

    // Per-category summary after merging.
    console.log('📊 合并后的分类统计:');
    const stats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count,
             ROUND(COUNT(DISTINCT ut.user_id) * 100.0 / (SELECT COUNT(*) FROM users), 1) as coverage
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.sort_order
    `).all();

    for (const stat of stats) {
      console.log(` • ${stat.name.padEnd(20)}: ${stat.tag_count} tags, ${stat.user_count || 0} users (${stat.coverage || 0}%)`);
    }

    db.close();
  } catch (e) {
    console.error('❌ 错误:', e.message);
    console.error(e);
    db.close();
    process.exit(1);
  }
}
|
||||
|
||||
// Script entry point: run the merge when the file is executed.
mergeTags();
|
||||
124
scripts/quality-check-1.py
Normal file
124
scripts/quality-check-1.py
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env python3
|
||||
"""全面质量检查脚本"""
|
||||
import openpyxl
|
||||
import sqlite3
|
||||
|
||||
print("\n" + "="*70)
print("🔍 全面质量检查")
print("="*70 + "\n")

# ============================================================================
# 1. Excel workbook comparison
# ============================================================================
print("1️⃣ EXCEL 文件结构和内容对比")
print("-"*70 + "\n")

# Load the raw export and both cleaned versions, in that order.
wb0, wb1, wb2 = (
    openpyxl.load_workbook(workbook_path)
    for workbook_path in (
        '/Users/inkling/Desktop/dmp/家庭教育档案-天数.xlsx',
        '/Users/inkling/Desktop/dmp/清洗1.0.xlsx',
        '/Users/inkling/Desktop/dmp/清洗2.0.xlsx',
    )
)
ws0, ws1, ws2 = (book.active for book in (wb0, wb1, wb2))

# Sheet dimensions.
print(f"📊 行列统计:")
for sheet_label, sheet in (
    (" 原始(家庭教育档案-天数)", ws0),
    (" 清洗1.0", ws1),
    (" 清洗2.0", ws2),
):
    print(f"{sheet_label}: {sheet.max_row} rows × {sheet.max_column} cols")

# Header comparison for the first 16 columns; the status column only
# compares the two cleaned sheets with each other.
print(f"\n📋 列结构对比:")
print(f" {'列':<3} {'原始':<25} {'清洗1.0':<25} {'清洗2.0':<25} {'状态':<5}")
print(f" {'-'*3} {'-'*25} {'-'*25} {'-'*25} {'-'*5}")

for col in range(1, 17):
    h0, h1, h2 = (
        str(sheet.cell(1, col).value or '')[:22] for sheet in (ws0, ws1, ws2)
    )
    if h1 == h2:
        match = "✓"
    else:
        match = "✗"
    print(f" {col:<3} {h0:<25} {h1:<25} {h2:<25} {match:<5}")

# Data completeness check
print(f"\n✅ 数据完整性 (前100行检查):")
|
||||
|
||||
def check_null_rate(ws, start_col=1, end_col=16, rows=100):
    """Measure how many sampled cells are empty in each column.

    Rows are sampled from row 2 (the row after the header) for at most
    ``rows`` rows, never past ``ws.max_row``. Columns run from ``start_col``
    to ``end_col`` inclusive, never past ``ws.max_column``.

    Returns a dict mapping column number to a tuple
    ``(empty_count, sampled_count, empty_percentage)``. The dict is empty
    when there are no rows to sample.
    """
    sampled_rows = range(2, min(rows + 2, ws.max_row + 1))
    sampled = len(sampled_rows)
    if sampled == 0:
        return {}

    summary = {}
    last_col = min(end_col, ws.max_column)
    for col_idx in range(start_col, last_col + 1):
        missing = sum(
            1 for row_idx in sampled_rows
            if ws.cell(row_idx, col_idx).value is None
        )
        summary[col_idx] = (missing, sampled, 100 * missing / sampled)
    return summary
|
||||
|
||||
def _print_null_summary(label, stats):
    """Print ``label`` followed by the columns of ``stats`` that have empty cells.

    ``stats`` is the mapping returned by ``check_null_rate``. When no column
    has a non-zero empty rate a single "no missing values" line is printed;
    otherwise the affected columns are listed on one line without a trailing
    newline.
    """
    print(label, end="")
    flagged = [
        (col, rate)
        for col, (_, _, rate) in sorted(stats.items())
        if rate > 0
    ]
    if not flagged:
        print("✓ 完全无缺失值")
        return
    for col, rate in flagged:
        print(f"列{col}({rate:.0f}%) ", end="")


nulls1 = check_null_rate(ws1)
nulls2 = check_null_rate(ws2)

_print_null_summary(" 清洗1.0: ", nulls1)
# The leading newline ends the previous report line when it listed columns.
_print_null_summary("\n 清洗2.0: ", nulls2)
print()
|
||||
|
||||
# ============================================================================
# 2. Database content checks
# ============================================================================
print(f"\n\n2️⃣ 数据库内容检查")
print("-"*70 + "\n")

conn = sqlite3.connect('/Users/inkling/Desktop/dmp/dmp_onion.db')
try:
    cursor = conn.cursor()

    # Users
    cursor.execute('SELECT COUNT(*) FROM users')
    user_count = cursor.fetchone()[0]
    print(f"👥 用户数: {user_count}")

    # Tags
    cursor.execute('SELECT COUNT(*) FROM tags')
    tag_count = cursor.fetchone()[0]
    print(f"🏷️ 标签数: {tag_count}")

    # Categories
    cursor.execute('SELECT COUNT(*) FROM tag_categories')
    cat_count = cursor.fetchone()[0]
    print(f"📂 分类数: {cat_count}")

    # User-tag links
    cursor.execute('SELECT COUNT(*) FROM user_tags')
    rel_count = cursor.fetchone()[0]
    print(f"🔗 关系数: {rel_count}")

    # Per-category distribution: tags, distinct tagged users and link rows.
    print(f"\n📊 标签分类分布:")
    cursor.execute('''
        SELECT tc.name, COUNT(DISTINCT t.id) as tag_count,
               COUNT(DISTINCT ut.user_id) as user_count,
               COUNT(ut.id) as rel_count
        FROM tag_categories tc
        LEFT JOIN tags t ON tc.id = t.category_id
        LEFT JOIN user_tags ut ON t.id = ut.tag_id
        GROUP BY tc.id
        ORDER BY tc.id
    ''')

    for row in cursor.fetchall():
        name, tags, users, rels = row
        # Guard both operands: links can exist while the users table is empty.
        coverage = f"{(users*100/user_count):.0f}%" if users and user_count else "0%"
        print(f" • {name:<20} {tags:3d} tags, {users:4d} users ({coverage:>3s}), {rels:5d} relations")
finally:
    # Release the connection even when one of the queries fails.
    conn.close()

print("\n" + "="*70)
|
||||
Reference in New Issue
Block a user