Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

70
scripts/analyze-excel.py Normal file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
import openpyxl
# Open both cleaned workbooks and grab the active sheet of each.
wb1 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗1.0.xlsx')
wb2 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗2.0.xlsx')
ws1, ws2 = wb1.active, wb2.active

# Preview sheet rows 2-4 (the first three data rows), columns 1-7, per file.
print("清洗1.0 - First 3 users (columns 1-7):")
for row in (2, 3, 4):
    cols = [ws1.cell(row, col).value for col in range(1, 8)]
    print(f" Row {row}: {cols}")

print("\n清洗2.0 - First 3 users (columns 1-7):")
for row in (2, 3, 4):
    cols = [ws2.cell(row, col).value for col in range(1, 8)]
    print(f" Row {row}: {cols}")

# Collect the distinct, stripped, truthy values of column 1 in each sheet
# and compare the two sets.
print("\n清洗1.0 中的家庭角色值:")
first_column_1 = (ws1.cell(row, 1).value for row in range(2, ws1.max_row + 1))
roles_1 = {str(val).strip() for val in first_column_1 if val}
print(f"Unique values: {len(roles_1)}")

print("\n清洗2.0 中的家庭角色值:")
first_column_2 = (ws2.cell(row, 1).value for row in range(2, ws2.max_row + 1))
roles_2 = {str(val).strip() for val in first_column_2 if val}
print(f"Unique values: {len(roles_2)}")

print("\nOverlap analysis:")
print(f"Matching roles: {len(roles_1.intersection(roles_2))}")
print(f"Unique to 1.0: {len(roles_1.difference(roles_2))}")
print(f"Unique to 2.0: {len(roles_2.difference(roles_1))}")
# Row fingerprint used to compare rows across the two workbooks.
def make_key(ws, row):
    """Return the values of columns 1-7 of ``row`` joined with ``|``.

    ``ws`` only needs a ``cell(row, col)`` method whose result exposes
    ``.value``. A ``None`` cell contributes an empty string; every other
    value is converted with ``str()``.
    """
    values = (ws.cell(row, col).value for col in range(1, 8))
    return "|".join("" if value is None else str(value) for value in values)
# Compare the two sheets row-by-row using the 7-column fingerprint.
print("\nChecking row overlap by first 7 columns:")
keys_1 = {make_key(ws1, row) for row in range(2, ws1.max_row + 1)}
keys_2 = {make_key(ws2, row) for row in range(2, ws2.max_row + 1)}
overlap = len(keys_1.intersection(keys_2))
print(f"Matching rows: {overlap}")
# The totals below count distinct fingerprints (sets), not raw sheet rows.
print(f"Total rows 1.0: {len(keys_1)}")
print(f"Total rows 2.0: {len(keys_2)}")

View File

@@ -0,0 +1,223 @@
const { getDb } = require('../db/init');
// Database handle obtained from the project's getDb helper for the 'onion' suffix.
const db = getDb('onion');
// Key of the tag category this script operates on.
const CATEGORY_KEY = 'basic_info_role';
// Maps a raw (trimmed) tag name to its canonical family-role name.
// Lookup is exact-match only (see canonicalizeName); several keys are
// typo / spacing variants of the same word.
const RENAME_MAP = new Map([
// Mother variants
['母', '妈妈'],
['妈', '妈妈'],
['母亲', '妈妈'],
['母 亲', '妈妈'],
['母親', '妈妈'],
['毋亲', '妈妈'],
['妈 妈', '妈妈'],
['妈吗', '妈妈'],
['妈好', '妈妈'],
['妈专', '妈妈'],
['蚂妈', '妈妈'],
['宝妈', '妈妈'],
['全职妈妈', '妈妈'],
['家庭主妇', '妈妈'],
['主妇', '妈妈'],
['家家庭主妇', '妈妈'],
['女主人', '妈妈'],
// Father variants
['父', '爸爸'],
['爸', '爸爸'],
['父亲', '爸爸'],
['父 亲', '爸爸'],
['孩子爸', '爸爸'],
['爸专', '爸爸'],
['爸备', '爸爸'],
// Grandparent variants
['祖父', '爷爷'],
['姥爷', '外公'],
['外爷', '外公'],
['祖母', '奶奶'],
['姥姥', '外婆'],
['姥姥/外婆', '外婆'],
// NOTE: maps the canonical name to itself, i.e. a no-op entry.
['外婆', '外婆'],
['婆婆', '奶奶'],
// Other unambiguous relatives
['姑妈', '姑姑'],
]);
// Names that are explicit kinship roles within the family-role category;
// tags with exactly these names are kept untouched. Every value of
// RENAME_MAP is also a member of this set.
const KEEP_SET = new Set([
'妈妈', '爸爸', '爷爷', '奶奶', '外公', '外婆',
'姑姑', '舅舅', '姨妈', '伯娘', '继母', '妻子',
'女儿', '儿子', '姐姐', '父母', '家长', '其他监护人',
]);
// Exact names treated as noise rather than family roles: descriptions,
// garbled text, placeholders.
// NOTE(review): some entries here also appear in KEEP_SET ('父母', '家长')
// or as RENAME_MAP keys (e.g. '母亲', '姥爷'). shouldDelete() only consults
// this set, so which rule wins depends on the order in which the caller
// checks KEEP_SET / RENAME_MAP / shouldDelete — confirm that the overlap
// is intentional.
const DELETE_EXACT = new Set([
'上班族', '母性', '女', '主', '主妇', '全职', '母中', '母女', '母子',
'一般', '陪读', '父母', '母家', '高中', '经济', '无', '目前', '内勤',
'带娃', '白黑', '家长', '全能', '次', '普通', '好人', '主导', '主角',
'主内', '主&角初中', '初中', '文 化', '/', 'I', '13296773713',
'盛自根', '经济支柱', '经济、教育、生活是核心', '助推庭教育',
'呵护,做具体事', '教育陪伴孩子', '照孩子', '家庭主妇', '家家庭主妇',
'妈专', '妈好', '妈吗', '妈 妈', '父 亲', '妈 亲', '母 亲', '母親',
'母', '父', '爸', '孩子爸', '爸专', '爸备', '宝妈', '蚂妈', '毋亲',
'外爷', '姥爷', '祖父', '祖母', '姑妈', '婆婆', '女主人', '母亲',
]);
// Regex-based noise detectors: a tag name matching ANY of these patterns is
// treated as noise (see shouldDelete).
const DELETE_PATTERNS = [
  /^\d+$/, // digits only
  // No letter and no digit at all, i.e. only symbols / punctuation / whitespace.
  // Written with Unicode property escapes on purpose: the ASCII-only `\W`
  // also matches every CJK character, which made every purely Chinese tag
  // name look like "symbols only".
  /^[^\p{L}\p{N}]+$/u,
  /联系方式|电话|手机号|微信/, // contact-information fragments
  // Descriptive / occupational phrases (substring match anywhere in the name).
  /上班|内勤|经济|教育|陪伴|助推|呵护|主导|主角|全能|普通|一般|目前|无|好人|次/,
  /家庭主妇|主妇|全职|陪读|带娃/,
  /文化|初中|高中|白黑|盛自根/,
];
/**
 * Normalize a raw tag name.
 *
 * @param {*} rawName value to normalize; falsy values are treated as ''.
 * @returns {string|null} null when the trimmed name is empty, the mapped
 *   name when RENAME_MAP has an exact entry for it, otherwise the trimmed
 *   name itself.
 */
function canonicalizeName(rawName) {
  const trimmed = String(rawName || '').trim();
  if (trimmed === '') {
    return null;
  }
  return RENAME_MAP.has(trimmed) ? RENAME_MAP.get(trimmed) : trimmed;
}
/**
 * Decide whether a tag name is noise.
 *
 * @param {string} name tag name to classify.
 * @returns {boolean} true when the name is listed in DELETE_EXACT or matches
 *   at least one regex in DELETE_PATTERNS.
 */
function shouldDelete(name) {
  if (DELETE_EXACT.has(name)) {
    return true;
  }
  for (const pattern of DELETE_PATTERNS) {
    if (pattern.test(name)) {
      return true;
    }
  }
  return false;
}
/**
 * Recompute `coverage` and `coverage_rate` for every row of `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage * 100 / total users, rounded to 2 decimals
 *
 * Done with one set-based UPDATE instead of one UPDATE per tag: the
 * statement already derives each row's values from `tags.id`, so iterating
 * over ids only added a round-trip (and a separate commit) per tag.
 *
 * @param {object} dbConn connection exposing `prepare(sql)` whose statements
 *   provide `get()` and `run(...params)`.
 */
function updateStats(dbConn) {
  // Fall back to 1 when the users table is empty so the rate never divides by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;
  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
/**
 * Clean up the tags of the CATEGORY_KEY category inside one transaction:
 *
 *   1. names in KEEP_SET are left alone;
 *   2. names whose canonical form (RENAME_MAP) is in KEEP_SET are merged into
 *      the canonical tag, or renamed to it when that tag does not exist yet;
 *   3. names classified as noise by shouldDelete() are deleted together with
 *      their user_tags rows;
 *   4. everything else is kept, then a fallback sweep merges remaining names
 *      containing a mother/father character into '妈妈' / '爸爸'.
 *
 * Afterwards coverage statistics are refreshed and a summary is printed.
 * On any error the process exits with code 1.
 */
function main() {
  try {
    const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`);
    const catId = category.id;
    const tags = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);
    console.log('🧹 开始清理家庭角色噪声数据...');
    console.log(`📂 当前标签数: ${tags.length}`);

    let merged = 0;
    let renamed = 0;
    let deleted = 0;
    let kept = 0;

    const tx = db.transaction(() => {
      const getByName = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
      // Prepared once and reused; the original re-prepared this inside the loops.
      const moveRel = db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id)
        SELECT user_id, ? FROM user_tags WHERE tag_id = ?`);
      const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
      const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');
      const updateTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?');

      // Delete a tag together with its user links.
      const removeTag = (tagId) => {
        deleteRel.run(tagId);
        deleteTag.run(tagId);
      };
      // Copy the source tag's user links onto the target, then drop the source.
      const mergeInto = (targetId, sourceId) => {
        moveRel.run(targetId, sourceId);
        removeTag(sourceId);
      };

      for (const tag of tags) {
        const originalName = String(tag.name || '').trim();
        const canonicalName = canonicalizeName(originalName);

        if (KEEP_SET.has(originalName)) {
          kept += 1;
          continue;
        }

        if (canonicalName && canonicalName !== originalName && KEEP_SET.has(canonicalName)) {
          const target = getByName.get(catId, canonicalName);
          if (target) {
            mergeInto(target.id, tag.id);
            console.log(`✅ 合并: ${originalName} -> ${canonicalName}`);
            merged += 1;
          } else {
            // No canonical tag yet: promote this one by renaming it, so the
            // synonym (and its users) is not silently left behind. Later
            // synonyms of the same name will then merge into it.
            updateTag.run(canonicalName, tag.id);
            console.log(`✅ 重命名: ${originalName} -> ${canonicalName}`);
            renamed += 1;
          }
          continue;
        }

        // Not on the keep list: delete when it is obvious noise.
        if (shouldDelete(originalName)) {
          removeTag(tag.id);
          console.log(`🗑️ 删除: ${originalName}`);
          deleted += 1;
          continue;
        }

        // No rule applies: be conservative, keep the tag unchanged.
        kept += 1;
      }

      // Fallback sweep over what is left: names that contain a mother/father
      // character but were not covered by RENAME_MAP are merged into the
      // canonical tag when it exists.
      const fallbackRules = [
        { pattern: /妈|母|宝妈/, canonical: '妈妈' },
        { pattern: /爸|父|孩子爸/, canonical: '爸爸' },
      ];
      const leftovers = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);
      for (const tag of leftovers) {
        const name = String(tag.name || '').trim();
        if (KEEP_SET.has(name)) continue;
        for (const { pattern, canonical } of fallbackRules) {
          if (!pattern.test(name)) continue;
          const target = getByName.get(catId, canonical);
          if (!target || target.id === tag.id) continue;
          mergeInto(target.id, tag.id);
          console.log(`✅ 合并(兜底): ${name} -> ${canonical}`);
          merged += 1;
          break;
        }
      }
    });
    tx();

    updateStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count,
        (SELECT COUNT(*) FROM tags WHERE name = '妈妈') AS mom_count,
        (SELECT COUNT(*) FROM tags WHERE name = '爸爸') AS dad_count,
        (SELECT COUNT(*) FROM tags WHERE name = '爷爷') AS grandpa_count,
        (SELECT COUNT(*) FROM tags WHERE name = '外公') AS mgp_count,
        (SELECT COUNT(*) FROM tags WHERE name = '外婆') AS mgm_count
    `).get(CATEGORY_KEY);

    console.log('\n✨ 清理完成');
    console.log(` • 合并: ${merged}`);
    console.log(` • 重命名: ${renamed}`);
    console.log(` • 删除: ${deleted}`);
    console.log(` • 保留(未改名): ${kept}`);
    console.log(` • 家庭角色标签剩余: ${stats.tag_count}`);
    console.log(` • 妈妈/爸爸/爷爷/外公/外婆 计数: ${stats.mom_count}/${stats.dad_count}/${stats.grandpa_count}/${stats.mgp_count}/${stats.mgm_count}`);
    db.close();
  } catch (error) {
    console.error('❌ 清理失败:', error);
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
main();

View File

@@ -0,0 +1,74 @@
const { getDb } = require('../db/init');
// Database handle obtained from the project's getDb helper for the 'onion' suffix.
const db = getDb('onion');
// Tag names considered wrong or irrelevant for the family-role category;
// cleanupInvalidTags() deletes each of them together with its user links.
const FAMILY_ROLE_INVALID_TAGS = [
'初中', // school-stage label, not a family role
'大姐', // not a primary family role
'舅舅', // uncle role; scope too narrow
'妻子', // not a child-related family role
'母亲相当单亲家庭', // bad data
'母子', // not a standard family role
'女儿', // belongs in a different category
'文 化', // completely unrelated
'', // "symbol" — NOTE(review): the literal is an empty string here; the intended symbol may have been lost, confirm against the data
];
/**
 * Delete every tag listed in FAMILY_ROLE_INVALID_TAGS from the '家庭角色'
 * category, together with its user_tags rows, then print a summary and the
 * remaining tags of that category.
 *
 * The lookup is scoped to the family-role category on purpose: names such as
 * '初中' can legitimately exist in other categories, and an unscoped
 * `WHERE name = ?` would delete whichever same-named tag SQLite returns
 * first. All deletions run in a single transaction so a failure leaves the
 * database unchanged.
 *
 * Exits the process with code 0 on success and 1 on failure.
 */
function cleanupInvalidTags() {
  try {
    console.log('🧹 开始清理无效标签...\n');

    const category = db.prepare('SELECT id FROM tag_categories WHERE name = ?').get('家庭角色');
    if (!category) throw new Error('找不到分类: 家庭角色');

    const findTags = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?');
    const countUsers = db.prepare(
      'SELECT COUNT(DISTINCT user_id) as count FROM user_tags WHERE tag_id = ?'
    );
    const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');

    let deletedCount = 0;
    const removeInvalid = db.transaction(() => {
      for (const tagName of FAMILY_ROLE_INVALID_TAGS) {
        // `.all` so that duplicate rows with the same name are all removed.
        for (const tag of findTags.all(category.id, tagName)) {
          const userCount = countUsers.get(tag.id);
          deleteRel.run(tag.id);
          deleteTag.run(tag.id);
          console.log(`✅ 删除: "${tagName}" (${userCount?.count || 0} 用户)`);
          deletedCount++;
        }
      }
    });
    removeInvalid();

    console.log(`\n✨ 清理完成deleted: ${deletedCount}`);

    // Final state across the whole database.
    const finalCount = db.prepare('SELECT COUNT(*) as count FROM tags').get();
    const relationCount = db.prepare('SELECT COUNT(*) as count FROM user_tags').get();
    console.log(`\n📊 最终状态:`);
    console.log(` • 剩余标签总数: ${finalCount.count}`);
    console.log(` • 用户-标签关系总数: ${relationCount.count}`);

    // Remaining tags of the family-role category, most-covered first.
    console.log(`\n📋 家庭角色分类标签列表:`);
    const finalTags = db.prepare(
      `SELECT name, coverage, coverage_rate
       FROM tags
       WHERE category_id = ?
       ORDER BY coverage DESC`
    ).all(category.id);
    finalTags.forEach((tag, idx) => {
      console.log(` ${idx + 1}. ${tag.name}: ${tag.coverage} 用户 (${tag.coverage_rate}%)`);
    });
    console.log(`\n✨ 总计: ${finalTags.length} 个家庭角色标签`);

    db.close();
    process.exit(0);
  } catch (error) {
    console.error('❌ 错误:', error);
    // Closing must not mask the original error or prevent the non-zero exit.
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
cleanupInvalidTags();

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env node
const Database = require('better-sqlite3');
const path = require('path');
// One-off script: rewrite tag_categories.sort_order from a fixed table.
const dbPath = path.join(__dirname, '../dmp_onion.db');
const db = new Database(dbPath);
console.log('修复分类顺序...\n');

// Desired order. `id` is the hard-coded primary key of the category row,
// `sort` the new sort_order; `name` is only used for logging.
const updates = [
  { id: 46, sort: 0, name: '家庭角色' },
  { id: 34, sort: 1, name: '用户年龄段标签' },
  { id: 35, sort: 2, name: '孩子学段标签' },
  { id: 36, sort: 3, name: '家庭结构标签' },
  { id: 37, sort: 4, name: '教育风险标签' },
  { id: 38, sort: 5, name: '家庭支持度标签' },
  { id: 39, sort: 6, name: '付费能力标签' },
  { id: 40, sort: 7, name: '需求紧迫度标签' },
  { id: 41, sort: 8, name: '核心问题标签' },
  { id: 42, sort: 9, name: '干预难度标签' },
  { id: 43, sort: 10, name: '转化优先级标签' },
  { id: 44, sort: 11, name: '渠道适配标签' },
  { id: 45, sort: 12, name: '产品匹配标签' },
  { id: 47, sort: 13, name: '文化程度' },
  { id: 48, sort: 14, name: '服务周期标签' }
];

try {
  const stmt = db.prepare('UPDATE tag_categories SET sort_order = ? WHERE id = ?');
  // Apply everything atomically so a failure cannot leave a half-renumbered list.
  const applyAll = db.transaction(() => {
    for (const item of updates) {
      const { changes } = stmt.run(item.sort, item.id);
      if (changes === 0) {
        // The ids are hard-coded; report rows that do not exist instead of
        // printing them as if they had been reordered.
        console.warn(`⚠️ 未找到分类 ID ${item.id} (${item.name}),已跳过`);
        continue;
      }
      console.log(`${item.sort + 1}. ${item.name}`);
    }
  });
  applyAll();
  console.log('\n✅ 完成!');
} finally {
  // Release the handle on success and on failure alike.
  db.close();
}

View File

@@ -0,0 +1,94 @@
#!/usr/bin/env node
/**
* 修复分类重复问题
* 1. 删除"用户身份标签"分类及其所有标签和关系
* 2. 把"家庭角色"移到第一个位置
* 3. 调整其他分类的sort_order
*/
const Database = require('better-sqlite3');
const path = require('path');
const dbPath = path.join(__dirname, '../dmp_onion.db');
const db = new Database(dbPath);

// Hard-coded primary keys of the rows this one-off script touches.
const DUPLICATE_CATEGORY_ID = 33; // category to remove with all its tags and links
const FAMILY_ROLE_CATEGORY_ID = 46; // category that must be listed first

console.log('\n╔════════════════════════════════════════════════════════════════╗');
console.log('║ 🔧 修复分类重复问题 ║');
console.log('╚════════════════════════════════════════════════════════════════╝\n');

try {
  // Steps 1-6 modify data and run in one transaction: either the category is
  // fully removed and the order fully rewritten, or nothing changes.
  const applyFix = db.transaction(() => {
    // 1. Count the tags of the category being removed.
    console.log('1⃣ 获取\"用户身份标签\"的所有标签...');
    const tagIds = db.prepare('SELECT id FROM tags WHERE category_id = ?').all(DUPLICATE_CATEGORY_ID);
    console.log(` 找到 ${tagIds.length} 个标签`);

    // 2. Delete the user links of those tags.
    console.log('\n2⃣ 删除user_tags关系...');
    const relDeleted = db.prepare(
      'DELETE FROM user_tags WHERE tag_id IN (SELECT id FROM tags WHERE category_id = ?)'
    ).run(DUPLICATE_CATEGORY_ID).changes;
    console.log(` 删除了 ${relDeleted} 条关系`);

    // 3. Delete the tags.
    console.log('\n3⃣ 删除标签...');
    const tagDeleteResult = db.prepare('DELETE FROM tags WHERE category_id = ?').run(DUPLICATE_CATEGORY_ID);
    console.log(` 删除了 ${tagDeleteResult.changes} 个标签`);

    // 4. Delete the category itself.
    console.log('\n4⃣ 删除分类...');
    const catDeleteResult = db.prepare('DELETE FROM tag_categories WHERE id = ?').run(DUPLICATE_CATEGORY_ID);
    console.log(` 删除了 ${catDeleteResult.changes} 个分类`);

    // 5. Put the family-role category first.
    console.log('\n5⃣ 更新\"家庭角色\"的位置...');
    db.prepare('UPDATE tag_categories SET sort_order = 0 WHERE id = ?').run(FAMILY_ROLE_CATEGORY_ID);
    console.log(' ✓ 家庭角色现在排在第一位');

    // 6. Renumber the remaining categories, preserving their relative order.
    console.log('\n6⃣ 重新调整其他分类的顺序...');
    const categories = db.prepare('SELECT id, key, name, sort_order FROM tag_categories ORDER BY sort_order').all();
    const setOrder = db.prepare('UPDATE tag_categories SET sort_order = ? WHERE id = ?');
    // Start at 1: position 0 is already taken by the family-role category,
    // so starting at 0 would give two categories the same sort_order.
    let newOrder = 1;
    for (const cat of categories) {
      if (cat.id === FAMILY_ROLE_CATEGORY_ID) continue;
      if (cat.sort_order !== newOrder) {
        setOrder.run(newOrder, cat.id);
      }
      newOrder++;
    }
    console.log(` ✓ 调整了 ${newOrder - 1} 个分类`);
  });
  applyFix();

  // 7. Show the resulting category list.
  console.log('\n7⃣ 最终分类列表:');
  const finalCats = db.prepare('SELECT id, key, name, sort_order FROM tag_categories ORDER BY sort_order').all();
  for (const cat of finalCats) {
    console.log(` ${cat.sort_order + 1}. ${cat.name} (ID:${cat.id})`);
  }

  // 8. Overall row counts.
  console.log('\n📊 数据统计:');
  const stats = db.prepare(`
    SELECT
      (SELECT COUNT(*) FROM users) as 总用户,
      (SELECT COUNT(*) FROM tags) as 总标签,
      (SELECT COUNT(*) FROM tag_categories) as 分类数,
      (SELECT COUNT(*) FROM user_tags) as 总关系
  `).get();
  console.log(` • 总用户: ${stats.总用户}`);
  console.log(` • 总标签: ${stats.总标签}`);
  console.log(` • 分类数: ${stats.分类数} (从16减少到15)`);
  console.log(` • 总关系: ${stats.总关系}`);
  console.log('\n✅ 修复完成!\n');
} catch (e) {
  console.error('❌ 错误:', e.message);
  // Set the exit code instead of calling process.exit(): exit() terminates
  // immediately and would skip the `finally` block that closes the database.
  process.exitCode = 1;
} finally {
  db.close();
}

View File

@@ -0,0 +1,140 @@
const { getDb } = require('../db/init');
// Database handle obtained from the project's getDb helper for the 'onion' suffix.
const db = getDb('onion');
// Key of the tag category this script operates on.
const CATEGORY_KEY = 'basic_info_role';
// Tag names merged into the canonical father tag by main().
const FATHER_SYNONYMS = ['父', '爸', '父 亲', '孩子爸', '爸专', '爸备'];
// Grandfather synonyms consumed by main()'s grandparent merge step.
const GRANDPA_SYNONYMS = ['姥爷', '外爷'];
// Tag names merged into the canonical '外婆' tag by main().
const GRANDMA_SYNONYMS = ['姥姥', '姥姥/外婆'];
// Empty, and not referenced anywhere in the visible part of this file.
const GRANDSON_SYNONYMS = [];
/**
 * Recompute `coverage` and `coverage_rate` for every row of `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage * 100 / total users, rounded to 2 decimals
 *
 * Done with one set-based UPDATE instead of one UPDATE per tag: the
 * statement already derives each row's values from `tags.id`, so iterating
 * over ids only added a round-trip (and a separate commit) per tag.
 *
 * @param {object} dbConn connection exposing `prepare(sql)` whose statements
 *   provide `get()` and `run(...params)`.
 */
function updateStats(dbConn) {
  // Fall back to 1 when the users table is empty so the rate never divides by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;
  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
/**
 * Converge family-role tags of the CATEGORY_KEY category onto canonical
 * names, inside one transaction:
 *
 *   1. make sure the canonical tags exist by renaming a synonym when the
 *      canonical name is missing (父亲 -> 爸爸, 姥姥/外婆 -> 外婆, 姥爷 -> 外公);
 *   2. merge father synonyms into 爸爸;
 *   3. merge grandparent synonyms: 姥爷/外爷 -> 外公, 婆婆 -> 奶奶,
 *      姥姥 / 姥姥/外婆 -> 外婆.
 *
 * 姥爷 and 外爷 denote the maternal grandfather, so they are merged into 外公.
 * The previous version merged them into 爷爷 first, which also made its own
 * later "姥爷 -> 外公" step unreachable whenever 爷爷 existed.
 *
 * Afterwards coverage statistics are refreshed and a summary is printed.
 * On any error the process exits with code 1.
 */
function main() {
  try {
    const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`);
    const catId = category.id;

    const getTag = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
    const renameTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?');
    const mergeRel = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');

    // Rename `fallbackName` to `canonicalName` when only the former exists.
    const ensureCanonical = (canonicalName, fallbackName) => {
      if (getTag.get(catId, canonicalName)) return;
      const fallback = getTag.get(catId, fallbackName);
      if (!fallback) return;
      renameTag.run(canonicalName, fallback.id);
      console.log(`✅ 重命名: ${fallbackName} -> ${canonicalName}`);
    };

    // Move the user links of every existing synonym onto the canonical tag
    // and delete the synonym. Does nothing when the canonical tag is missing.
    const mergeSynonyms = (canonicalName, synonyms) => {
      const target = getTag.get(catId, canonicalName);
      if (!target) return;
      for (const synonym of synonyms) {
        const tag = getTag.get(catId, synonym);
        if (!tag || tag.id === target.id) continue;
        mergeRel.run(target.id, tag.id);
        deleteRel.run(tag.id);
        deleteTag.run(tag.id);
        console.log(`✅ 合并: ${synonym} -> ${canonicalName}`);
      }
    };

    const tx = db.transaction(() => {
      // 1) Make sure the canonical names exist.
      ensureCanonical('爸爸', '父亲');
      ensureCanonical('外婆', '姥姥/外婆');
      ensureCanonical('外公', '姥爷');

      // 2) Father variants. '父亲' is included so that it is merged when
      //    both '父亲' and '爸爸' already exist (the rename above is skipped
      //    in that case).
      mergeSynonyms('爸爸', ['父亲', ...FATHER_SYNONYMS]);

      // 3) Grandparents.
      mergeSynonyms('外公', GRANDPA_SYNONYMS);
      mergeSynonyms('奶奶', ['婆婆']);
      mergeSynonyms('外婆', GRANDMA_SYNONYMS);
    });
    tx();

    updateStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count
    `).get(CATEGORY_KEY);

    console.log('\n✨ 标准名修复完成');
    console.log(` • 家庭角色标签剩余: ${stats.tag_count}`);
    console.log(` • 用户-标签关系总数: ${stats.rel_count}`);
    db.close();
  } catch (error) {
    console.error('❌ 标准名修复失败:', error);
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
main();

View File

@@ -0,0 +1,80 @@
const { getDb } = require('../db/init');
// Database handle obtained from the project's getDb helper for the 'onion' suffix.
const db = getDb('onion');
// Key of the tag category this script operates on.
const CATEGORY_KEY = 'basic_info_role';
// Tag names whose user links main() moves onto '其他监护人' before deleting the tag.
const MERGE_TO_OTHER = ['家长', '父母'];
// Tag names that main() deletes outright, together with their user links.
const DELETE_ONLY = ['妻子', '女儿', '姐姐', '儿子'];
/**
 * Recompute `coverage` and `coverage_rate` for every row of `tags`.
 *
 * coverage      = number of distinct users linked to the tag in `user_tags`
 * coverage_rate = coverage * 100 / total users, rounded to 2 decimals
 *
 * Done with one set-based UPDATE instead of one UPDATE per tag: the
 * statement already derives each row's values from `tags.id`, so iterating
 * over ids only added a round-trip (and a separate commit) per tag.
 *
 * @param {object} dbConn connection exposing `prepare(sql)` whose statements
 *   provide `get()` and `run(...params)`.
 */
function updateStats(dbConn) {
  // Fall back to 1 when the users table is empty so the rate never divides by zero.
  const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;
  dbConn.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
  `).run(totalUsers);
}
/**
 * Second convergence pass over the CATEGORY_KEY category: tags named in
 * MERGE_TO_OTHER are folded into the existing '其他监护人' tag, and tags named
 * in DELETE_ONLY are removed together with their user links. Both steps run
 * in one transaction; afterwards coverage statistics are refreshed and a
 * summary is printed. On any error the process exits with code 1.
 */
function main() {
  try {
    const categoryRow = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
    if (!categoryRow) throw new Error(`找不到分类: ${CATEGORY_KEY}`);
    const catId = categoryRow.id;

    const findTag = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
    const copyLinks = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const removeLinks = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const removeTag = db.prepare('DELETE FROM tags WHERE id = ?');

    // The merge target must already exist; this script never creates it.
    const fallback = findTag.get(catId, '其他监护人');
    if (!fallback) throw new Error('找不到“其他监护人”标签,无法合并');

    // Drop a tag and every user link pointing at it.
    const purge = (tagId) => {
      removeLinks.run(tagId);
      removeTag.run(tagId);
    };

    const converge = db.transaction(() => {
      MERGE_TO_OTHER.forEach((name) => {
        const found = findTag.get(catId, name);
        if (found && found.id !== fallback.id) {
          copyLinks.run(fallback.id, found.id);
          purge(found.id);
          console.log(`✅ 合并: ${name} -> 其他监护人`);
        }
      });
      DELETE_ONLY.forEach((name) => {
        const found = findTag.get(catId, name);
        if (found) {
          purge(found.id);
          console.log(`🗑️ 删除: ${name}`);
        }
      });
    });
    converge();

    updateStats(db);

    const summary = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
        (SELECT COUNT(*) FROM user_tags) AS rel_count
    `).get(CATEGORY_KEY);
    console.log('\n✨ 二次收敛完成');
    console.log(` • 家庭角色标签剩余: ${summary.tag_count}`);
    console.log(` • 用户-标签关系总数: ${summary.rel_count}`);
    db.close();
  } catch (error) {
    console.error('❌ 二次收敛失败:', error);
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
main();

View File

@@ -0,0 +1,84 @@
/**
* 修复标签覆盖率统计
* 更新所有标签的coverage和coverage_rate字段
*/
const { getDb } = require('../db/init');
/**
 * Recompute `coverage` (distinct users per tag) and `coverage_rate`
 * (coverage / total users * 100, 2 decimals) for every tag, then print a
 * five-row sample and overall min / max / average coverage.
 *
 * The database handle is always closed, including on the early return taken
 * when there are no users. On failure the error message is logged and the
 * process exit code is set to 1.
 *
 * @param {string} [dbSuffix='onion'] suffix passed to getDb().
 */
function updateTagStats(dbSuffix = 'onion') {
  const db = getDb(dbSuffix);
  try {
    // Total number of users (denominator of the rate).
    const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n;
    if (totalUsers === 0) {
      console.error('❌ 没有用户数据');
      return;
    }
    console.log(`\n🔄 更新标签覆盖率统计(总用户数: ${totalUsers})`);

    const tags = db.prepare('SELECT id FROM tags').all();
    // Both statements are prepared once and reused for every tag.
    const countStmt = db.prepare(`
      SELECT COUNT(DISTINCT user_id) as cnt FROM user_tags WHERE tag_id = ?
    `);
    const updateStmt = db.prepare(`
      UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
    `);

    let updated = 0;
    // One transaction for all rows: atomic, and avoids a commit per tag.
    const updateAll = db.transaction(() => {
      for (const tag of tags) {
        const coverage = countStmt.get(tag.id).cnt || 0;
        const coverage_rate = +(coverage / totalUsers * 100).toFixed(2);
        updateStmt.run(coverage, coverage_rate, tag.id);
        updated++;
        if (updated % 50 === 0) {
          console.log(` ✓ 已更新 ${updated} 个标签...`);
        }
      }
    });
    updateAll();
    console.log(`\n✅ 更新完成: ${updated} 个标签\n`);

    // Sample rows.
    console.log('📊 样本数据前5个标签:');
    const samples = db.prepare(`
      SELECT id, name, coverage, coverage_rate FROM tags LIMIT 5
    `).all();
    for (const sample of samples) {
      console.log(`${sample.name}: ${sample.coverage} users (${sample.coverage_rate}%)`);
    }

    // Overall statistics.
    console.log('\n📊 整体统计:');
    const stats = db.prepare(`
      SELECT
        MIN(coverage) as min_coverage,
        MAX(coverage) as max_coverage,
        ROUND(AVG(coverage), 2) as avg_coverage,
        COUNT(*) as total_tags
      FROM tags
    `).get();
    console.log(` • 总标签数: ${stats.total_tags}`);
    console.log(` • 覆盖范围: ${stats.min_coverage} - ${stats.max_coverage} 用户`);
    console.log(` • 平均覆盖: ${stats.avg_coverage} 用户`);
  } catch (e) {
    console.error('❌ 错误:', e.message);
    // exitCode rather than process.exit() so the `finally` close still runs.
    process.exitCode = 1;
  } finally {
    db.close();
  }
}
// Run the update.
updateTagStats();

View File

@@ -0,0 +1,291 @@
/**
* 为清洗2.0中的所有用户生成标签
*
* 策略:对于没有标签的用户,基于其他列的值生成标签
* - 用户年龄段标签 <- 年龄列4
* - 孩子学段标签 <- 年级列7
* - 教育风险标签 <- 综合判断
* 等
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb } = require('../db/init');
// Read a worksheet cell as text; empty / missing cells become ''.
const cellText = (row, col) => String(row.values[col] || '');

/**
 * Tag generation rules, keyed by tag-category key. Each rule receives a
 * worksheet row (only `row.values`, 1-based, is read) and returns the tag
 * name for that category, or null when no tag can be derived.
 */
const TAG_GENERATORS = {
  user_age_group: (row) => {
    // Column 4: age.
    const age = parseInt(row.values[4], 10);
    if (!age || isNaN(age)) return null;
    // Thresholds follow the upper bound stated in each label (the first
    // bucket previously ended at 30 although its label says 25-35).
    if (age < 35) return '年轻(25-35岁)';
    if (age < 45) return '中青年(35-45岁)';
    if (age < 55) return '中年(45-55岁)';
    if (age < 65) return '中老年(55-65岁)';
    return '低龄老年(65-75岁)';
  },
  child_grade: (row) => {
    // Column 7: grade.
    const grade = row.values[7];
    if (!grade) return null;
    const gradeStr = String(grade).toLowerCase();
    if (gradeStr.includes('幼') || gradeStr.includes('小学')) {
      if (gradeStr.includes('低')) return '小学低段(1-3年级)';
      return '小学高段(4-6年级)';
    }
    if (gradeStr.includes('初')) return '初中前期(初一初二)';
    if (gradeStr.includes('高')) return '高中前期(高一高二)';
    return '小学高段(4-6年级)';
  },
  family_structure: (row) => {
    // Column 1: family role, column 5: second family role.
    const role = cellText(row, 1);
    const role2 = cellText(row, 5);
    // '外祖' contains '祖', so a single check per column is sufficient.
    const hasGrandparents = role.includes('祖') || role2.includes('祖');
    const isSingleParent = role.includes('单亲') || role.includes('离异');
    if (isSingleParent) return '离异家庭+隔代抚养-双重风险';
    if (hasGrandparents) return '三代同堂-传统大家庭';
    return '核心家庭-父母直接养育';
  },
  education_risk: (row) => {
    // Combined score over several columns.
    const score =
      (cellText(row, 8).includes('差') ? 3 : 0) + // academic performance
      (cellText(row, 12).includes('是') ? 2 : 0) + // often negates the child
      (cellText(row, 13).includes('是') ? 3 : 0); // physical punishment
    if (score >= 5) return '高风险(5分)';
    if (score >= 3) return '中高风险(3分)';
    return '低风险(1分)';
  },
  family_support: (row) => {
    // Parent-child relationship and whether the parents disagree.
    const relation = cellText(row, 10);
    const divergence = cellText(row, 11);
    const score = (relation.includes('良好') ? 2 : 0) + (divergence.includes('是') ? -1 : 1);
    if (score >= 2) return '高支持度(5分)';
    if (score >= 1) return '中等支持度(3分)';
    return '低支持度(2分)';
  },
  payment_ability: (row) => {
    // Occupation and education level as a proxy for income.
    const profession = cellText(row, 3);
    const education = cellText(row, 2);
    const highProf = ['医', '律', '教授', '总监', '经理', '总经理', 'CFO'].some((x) => profession.includes(x));
    const highEdu = education.includes('硕') || education.includes('博');
    if (highProf || highEdu) return '高付费能力(4分)';
    if (profession.includes('企业') || profession.includes('工程')) return '中等付费能力(0分)';
    return '基础付费能力(-2分)';
  },
  urgency: (row) => {
    // Academic performance plus the two parenting-behaviour columns.
    const score = cellText(row, 8);
    const behavior = cellText(row, 12) + cellText(row, 13);
    if (behavior.match(/打|责|否定/)) return '高度紧急(6分)';
    if (score.includes('差')) return '轻度紧急(1分)';
    return '常规咨询(0分)';
  },
  core_problem: (row) => {
    // Column 16: free-text description of the child's situation.
    const desc = cellText(row, 16);
    if (!desc) return '问题描述不足-需深入了解';
    if (desc.includes('成绩')) return '【学业】成绩下滑';
    if (desc.includes('游戏') || desc.includes('手机')) return '【行为】手机/游戏依赖';
    if (desc.includes('关系')) return '【关系】亲子冲突严重';
    return '【学业】成绩下滑';
  },
  intervention_difficulty: (row) => {
    // More than one distinct family role, and improper parenting.
    // (Previously the two role cells were joined with '|' and split again,
    // which always yields two parts, so the role score was always added and
    // the lowest difficulty level could never be returned.)
    const roles = new Set(
      [cellText(row, 1).trim(), cellText(row, 5).trim()].filter((role) => role !== '')
    );
    const education = cellText(row, 12) + cellText(row, 13);
    const score = (roles.size > 1 ? 2 : 0) + (education.includes('是') ? 3 : 0);
    if (score >= 4) return '极高难度(10分)';
    if (score >= 2) return '中等难度(4分)';
    return '较低难度(2分)';
  },
  conversion_priority: (row) => {
    const grade = cellText(row, 7);
    const highPriority = grade.includes('高中');
    return highPriority ? 'B级优先(50分)' : 'C级优先(49分)';
  },
  channel_adaption: (row) => {
    // Preferred contact channel inferred from age.
    const age = parseInt(row.values[4], 10);
    if (age && age > 55) return '电话跟进优先 > 子女协助转化 > 微信语音';
    return '微信私域 > 电话跟进 > 朋友圈';
  },
  product_match: (row) => {
    // Product matched to the school stage.
    const grade = cellText(row, 7);
    if (grade.includes('高中')) return '高考压力疏导 + 厌学干预方案';
    if (grade.includes('初中')) return '青春期应对方案 + 学习动力激活';
    return '习惯养成课程 + 亲子沟通指导';
  },
  service_duration: (row) => {
    // Service length inferred from the severity of the description.
    const desc = cellText(row, 16);
    if (desc.includes('休学') || desc.includes('辍学')) return '长周期(180天)';
    return '标准周期(60天)';
  }
};

/**
 * Generate rule-based tags for the users listed in 清洗2.0.xlsx.
 *
 * For every data row the user `user_<rowNumber>` is looked up; for each
 * category in TAG_GENERATORS in which that user has no tag yet, the rule's
 * tag is created if necessary and linked to the user. All writes run in one
 * transaction. Per-tag failures are counted and reported instead of being
 * silently ignored. The database handle is closed on every path; on a fatal
 * error the process exit code is set to 1.
 */
async function main() {
  let db;
  try {
    console.log('\n╔════════════════════════════════════════════════════════════════╗');
    console.log('║ 🏷️ 生成缺失的标签数据 ║');
    console.log('╚════════════════════════════════════════════════════════════════╝\n');
    db = getDb('onion');

    // Load the workbook.
    console.log('📖 读取清洗2.0.xlsx...');
    const wb = new ExcelJS.Workbook();
    await wb.xlsx.readFile(path.join(__dirname, '../清洗2.0.xlsx'));
    const ws = wb.worksheets[0];

    // Category key -> category id.
    const catIdMap = {};
    for (const cat of db.prepare('SELECT id, key FROM tag_categories').all()) {
      catIdMap[cat.key] = cat.id;
    }
    console.log('');

    // All statements are prepared once, outside the row loop.
    const findUserStmt = db.prepare('SELECT id FROM users WHERE uid = ?');
    const countCategoryTagsStmt = db.prepare(`
      SELECT COUNT(*) as cnt FROM user_tags ut
      JOIN tags t ON ut.tag_id = t.id
      WHERE ut.user_id = ? AND t.category_id = ?
    `);
    const insertTagStmt = db.prepare(`
      INSERT OR IGNORE INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
      VALUES (?, ?, ?, 0, 0, 0)
    `);
    const getTagIdStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const linkUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);

    let generated = 0;
    let inserted = 0;
    let failed = 0;
    const tagCache = new Map();

    // Return the id of the tag `tagValue` in category `catId`, creating it if needed.
    const resolveTagId = (catKey, catId, tagValue) => {
      const cacheKey = `${catId}:${tagValue}`;
      const cached = tagCache.get(cacheKey);
      if (cached) return cached;
      let tag = getTagIdStmt.get(catId, tagValue);
      if (!tag) {
        insertTagStmt.run(`${catKey}_${Math.random().toString(36).slice(2)}`, tagValue, catId);
        tag = getTagIdStmt.get(catId, tagValue);
      }
      const tagId = tag?.id;
      if (tagId) tagCache.set(cacheKey, tagId);
      return tagId;
    };

    const processRows = db.transaction(() => {
      ws.eachRow((row, rowNum) => {
        if (rowNum === 1) return; // skip header

        const user = findUserStmt.get(`user_${rowNum}`);
        if (!user) return;

        for (const [catKey, generator] of Object.entries(TAG_GENERATORS)) {
          try {
            const tagValue = generator(row);
            if (!tagValue) continue;
            const catId = catIdMap[catKey];
            if (!catId) continue;
            // Skip users that already have a tag in this category.
            if (countCategoryTagsStmt.get(user.id, catId).cnt > 0) continue;
            const tagId = resolveTagId(catKey, catId, tagValue);
            if (tagId) {
              linkUserTagStmt.run(user.id, tagId);
              inserted++;
            }
          } catch (e) {
            // Keep going, but make the failure visible (first few in detail).
            failed++;
            if (failed <= 5) {
              console.warn(`⚠️ 行 ${rowNum} 的 ${catKey} 标签生成失败: ${e.message}`);
            }
          }
        }

        generated++;
        if (generated % 500 === 0) {
          console.log(` ✓ 已处理 ${generated} 行...`);
        }
      });
    });
    processRows();

    console.log(`\n✅ 标签生成完成:`);
    console.log(` • 处理用户: ${generated}`);
    console.log(` • 新增标签链接: ${inserted}`);
    if (failed > 0) {
      console.log(` • 生成失败: ${failed}`);
    }

    // Per-category distribution; the percentage uses the real user count.
    console.log('\n📊 最终标签分布:');
    const totalUsers = db.prepare('SELECT COUNT(*) AS n FROM users').get().n;
    const tagStats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.id
      LIMIT 16
    `).all();
    for (const stat of tagStats) {
      const coverage = stat.user_count && totalUsers ? Math.round((stat.user_count / totalUsers) * 100) : 0;
      console.log(`${stat.name.padEnd(20)}: ${stat.tag_count} tags, ${stat.user_count || 0} users (${coverage}%)`);
    }
  } catch (e) {
    console.error('❌ Error:', e.message);
    console.error(e);
    // exitCode rather than process.exit() so the `finally` close still runs.
    process.exitCode = 1;
  } finally {
    if (db) db.close();
  }
}
main();

View File

@@ -0,0 +1,382 @@
/**
* 新数据导入脚本 v4.0
* 基于"清洗2.0.xlsx"的完整数据导入
*
* 特点:
* - 导入1956行用户数据
* - 直接使用清洗2.0中的预生成标签第17-31列
* - 创建16个标签分类
*
* 用法: node scripts/import-clean-data-v2.js
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
// Workbook to import, resolved relative to this script's directory.
const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// Tag category definitions. The array below has 15 entries; main() inserts
// them in this order and uses the array index as sort_order.
// Each entry: `key` (machine key), `name` (display name), `color` (hex color).
// ════════════════════════════════════════════════════════════════════════════
const TAG_CATEGORIES = [
{
key: 'basic_info_role',
name: '家庭角色',
color: '#d97706'
},
{
key: 'user_age_group',
name: '用户年龄段标签',
color: '#6366f1'
},
{
key: 'child_grade',
name: '孩子学段标签',
color: '#8b5cf6'
},
{
key: 'family_structure',
name: '家庭结构标签',
color: '#a78bfa'
},
{
key: 'education_risk',
name: '教育风险标签',
color: '#c084fc'
},
{
key: 'family_support',
name: '家庭支持度标签',
color: '#ec4899'
},
{
key: 'payment_ability',
name: '付费能力标签',
color: '#f472b6'
},
{
key: 'urgency',
name: '需求紧迫度标签',
color: '#f97316'
},
{
key: 'core_problem',
name: '核心问题标签',
color: '#06b6d4'
},
{
key: 'intervention_difficulty',
name: '干预难度标签',
color: '#0891b2'
},
{
key: 'conversion_priority',
name: '转化优先级标签',
color: '#10b981'
},
{
key: 'channel_adaption',
name: '渠道适配标签',
color: '#059669'
},
{
key: 'product_match',
name: '产品匹配标签',
color: '#f59e0b'
},
{
key: 'basic_info_education',
name: '文化程度',
color: '#dc2626'
},
{
key: 'service_duration',
name: '服务周期标签',
color: '#7c3aed'
}
];
// ════════════════════════════════════════════════════════════════════════════
// Column mapping for 清洗2.0.xlsx: logical field name -> 1-based column number
// ════════════════════════════════════════════════════════════════════════════
const COLUMN_MAPPING = {
// Base data, columns 1-16
family_role: 1, // family role
education: 2, // education level
profession: 3, // occupation
age: 4, // age
family_role_2: 5, // family role (second column)
child_gender: 6, // gender
child_grade: 7, // grade
academic_score: 8, // academic performance
family_situation: 9, // basic family situation
parent_child_rel: 10, // parent-child relationship
education_divergence: 11, // whether the parents disagree on education
negate_child: 12, // whether the child is often negated
physical_punishment: 13, // whether scolding / beating is used
child_with_parents: 14, // whether the child grew up with the parents
caregivers: 15, // who else takes part in raising the child
child_situation: 16, // description of the child's current situation
// Pre-generated tag columns, 17-31
service_days: 17, // number of days (a numeric value, not a tag)
user_identity: 18, // user identity tag
user_age: 19, // user age-group tag
child_grade_tag: 20, // child school-stage tag
family_struct_tag: 21, // family structure tag
education_risk: 22, // education risk tag
family_support: 23, // family support tag
payment_ability: 24, // payment ability tag
urgency: 25, // need urgency tag
core_problem: 26, // core problem tag
intervention_diff: 27, // intervention difficulty tag
conversion_priority: 28, // conversion priority tag
channel_adaption: 29, // channel fit tag
product_match: 30, // product match tag
service_duration: 31 // service duration tag
};
// ════════════════════════════════════════════════════════════════════════════
// 主程序
// ════════════════════════════════════════════════════════════════════════════
/**
 * Entry point of the 清洗2.0.xlsx import (v4.0).
 *
 * Steps, in order: initialise the database, delete every row from
 * user_tags / users / tags / tag_categories, recreate the categories from
 * TAG_CATEGORIES, read the first worksheet of EXCEL_FILE, insert one user per
 * data row together with its tags, refresh per-tag coverage and print summary
 * statistics. On any error the message and stack are printed and the process
 * exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('\n');
  console.log('╔════════════════════════════════════════════════════════════════╗');
  console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║');
  console.log('╚════════════════════════════════════════════════════════════════╝');
  console.log('');
  try {
    // Initialise the database.
    console.log('🔧 初始化数据库...');
    initializeDatabase();
    const db = getDb('onion');

    // Remove old data; link table first, then the tables it references.
    console.log('🗑️ 清除旧数据...');
    db.prepare('DELETE FROM user_tags').run();
    db.prepare('DELETE FROM users').run();
    db.prepare('DELETE FROM tags').run();
    db.prepare('DELETE FROM tag_categories').run();

    // Create the categories; the array index becomes sort_order.
    console.log('📂 创建标签分类...');
    const insertCategoryStmt = db.prepare(`
      INSERT INTO tag_categories (key, name, color, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    const categoryMap = {};
    TAG_CATEGORIES.forEach((cat, idx) => {
      const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
      categoryMap[cat.key] = result.lastInsertRowid;
    });
    console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`);

    // Read the Excel file.
    console.log('📖 读取Excel文件...');
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.worksheets[0];
    console.log(` • 工作表: ${worksheet.name}`);
    console.log(` • 行数: ${worksheet.rowCount}`);
    console.log(` • 列数: ${worksheet.columnCount}\n`);

    // Prepare the SQL statements once; they are reused for every row.
    const insertUserStmt = db.prepare(`
      INSERT INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
      VALUES (?, ?, ?, 0, 0, 0)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    // Name-based lookup, prepared once instead of on every cache miss.
    const findTagStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);

    // Tag cache: "<category key>:<tag name>" -> tag id.
    const tagCache = {};

    // Returns the id of the tag `tagName` in category `catKey`, creating the
    // tag on first use. Returns null for blank names.
    function getOrCreateTag(catKey, tagName) {
      if (!tagName || String(tagName).trim() === '') return null;
      const normalizedName = String(tagName).trim();
      const cacheKey = `${catKey}:${normalizedName}`;
      if (tagCache[cacheKey]) {
        return tagCache[cacheKey];
      }
      let tag = findTagStmt.get(categoryMap[catKey], normalizedName);
      if (!tag) {
        // The tag key only has to be unique; a random suffix is used.
        const result = insertTagStmt.run(
          `${catKey}_${Math.random().toString(36).slice(2)}`,
          normalizedName,
          categoryMap[catKey]
        );
        tag = { id: result.lastInsertRowid };
      }
      tagCache[cacheKey] = tag.id;
      return tag.id;
    }

    // Import the data rows.
    console.log('📝 导入用户数据...\n');
    let insertedCount = 0;
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row
      rowCount++;
      const values = row.values;
      if (!values[COLUMN_MAPPING.family_role]) {
        // Only warn for the first few rows to keep the log short.
        if (rowCount <= 5) {
          console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`);
        }
        return;
      }

      // Create the user.
      const uid = `user_${rowCount}`;
      const extraData = {
        row: rowNumber,
        days: values[COLUMN_MAPPING.service_days] || 0
      };
      const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData));
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;

        // Tags from the raw columns: family role and education level.
        const role = values[COLUMN_MAPPING.family_role];
        if (role) {
          const tagId = getOrCreateTag('basic_info_role', role);
          if (tagId) insertUserTagStmt.run(userId, tagId);
        }
        const education = values[COLUMN_MAPPING.education];
        if (education) {
          const tagId = getOrCreateTag('basic_info_education', education);
          if (tagId) insertUserTagStmt.run(userId, tagId);
        }

        // Tags from the pre-generated tag columns (18-31).
        const tagColumns = [
          ['user_identity', COLUMN_MAPPING.user_identity],
          ['user_age_group', COLUMN_MAPPING.user_age],
          ['child_grade', COLUMN_MAPPING.child_grade_tag],
          ['family_structure', COLUMN_MAPPING.family_struct_tag],
          ['education_risk', COLUMN_MAPPING.education_risk],
          ['family_support', COLUMN_MAPPING.family_support],
          ['payment_ability', COLUMN_MAPPING.payment_ability],
          ['urgency', COLUMN_MAPPING.urgency],
          ['core_problem', COLUMN_MAPPING.core_problem],
          ['intervention_difficulty', COLUMN_MAPPING.intervention_diff],
          ['conversion_priority', COLUMN_MAPPING.conversion_priority],
          ['channel_adaption', COLUMN_MAPPING.channel_adaption],
          ['product_match', COLUMN_MAPPING.product_match],
          ['service_duration', COLUMN_MAPPING.service_duration]
        ];
        tagColumns.forEach(([catKey, colIdx]) => {
          const tagValue = values[colIdx];
          if (tagValue && String(tagValue).trim() !== '') {
            const tagId = getOrCreateTag(catKey, tagValue);
            if (tagId) insertUserTagStmt.run(userId, tagId);
          }
        });
        if (rowCount % 100 === 0) {
          console.log(` ✓ 已处理 ${rowCount} 行...`);
        }
      }
    });
    console.log(`\n✅ 用户导入完成：${insertedCount}\n`);

    // Refresh per-tag coverage.
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    // Print overall statistics.
    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories,
        (SELECT COUNT(*) FROM user_tags) as total_relationships
    `).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    console.log(` • 用户-标签关系: ${stats.total_relationships}`);

    // Print per-category statistics. The join with user_tags yields one row
    // per (tag, user) pair, so tags must be counted with DISTINCT; a plain
    // COUNT(t.id) would report the number of user-tag links instead.
    console.log('\n分类覆盖统计');
    const catStats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.id
    `).all();
    catStats.forEach(stat => {
      const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0;
      console.log(`${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`);
    });
    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    process.exit(1);
  }
}
/**
 * Recompute `coverage` (number of distinct users carrying the tag) and
 * `coverage_rate` (that number as a percentage of all users, rounded to two
 * decimals) for every row in `tags`.
 *
 * The user count is read once and bound as a parameter. When the users table
 * is empty the rate is stored as 0; dividing by the zero count inside SQL
 * would otherwise produce NULL.
 *
 * @param {object} db Open database handle exposing prepare()/get()/all()/run().
 */
function updateTagStats(db) {
  const totalUsers = db.prepare('SELECT COUNT(*) AS n FROM users').get().n;
  const updateStmt = db.prepare(`
    UPDATE tags
    SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = CASE
        WHEN ? > 0 THEN ROUND(
          (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?,
          2
        )
        ELSE 0
      END
    WHERE id = ?
  `);
  const allTags = db.prepare('SELECT id FROM tags').all();
  for (const tag of allTags) {
    updateStmt.run(totalUsers, totalUsers, tag.id);
  }
}
// Run the main program.
main();

View File

@@ -0,0 +1,448 @@
/**
* 清洗3.0 导入脚本 v1.0
*
* 业务约束:
* 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)”
* 2) 监护人2相关字段不参与建模
* 3) 删除付费能力标签分类
* 4) 全量替换导入
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx');
const DB_THEME = 'onion';
const TOTAL_USERS_FALLBACK = 11500;
// Tag categories created by this import. main() inserts them in array order and
// stores the array index as sort_order. There is no `payment_ability` entry:
// the header of this script lists removing that category as a business constraint.
const TAG_CATEGORIES = [
{ key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
{ key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
{ key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
{ key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
{ key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
{ key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
{ key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
{ key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
{ key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
{ key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
{ key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
{ key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
{ key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
{ key: 'service_duration', name: '服务周期标签', color: '#7c3aed' }
];
/**
 * Convert any value to a trimmed single-line string.
 *
 * `null` and `undefined` become the empty string; every run of whitespace
 * (including newlines and tabs) is collapsed to one space.
 *
 * @param {*} v Value to normalise.
 * @returns {string}
 */
function text(v) {
  if (v == null) {
    return '';
  }
  const collapsed = String(v).replace(/\s+/g, ' ');
  return collapsed.trim();
}
/**
 * Extract a finite number from a value.
 *
 * Every character other than digits, '.' and '-' is stripped before the
 * conversion, so "35岁" yields 35. Returns null for undefined / null / '',
 * for input without numeric characters, and for leftovers that are not a
 * valid number (for example "35-40" or "1.2.3").
 *
 * @param {*} v Value to parse.
 * @returns {number|null}
 */
function parseNumber(v) {
  const isBlank = v === undefined || v === null || v === '';
  if (isBlank) return null;

  const numericChars = String(v).replace(/[^\d.\-]/g, '');
  if (numericChars.length === 0) return null;

  const parsed = Number(numericChars);
  if (!Number.isFinite(parsed)) return null;
  return parsed;
}
/**
 * Split a multi-value cell into its individual, trimmed entries.
 *
 * The value is normalised with text() and split on runs of the separators
 * 、 , ; / and |. Empty entries are dropped.
 *
 * @param {*} v Cell value.
 * @returns {string[]} Entries in their original order; [] for blank input.
 */
function splitMulti(v) {
  const normalized = text(v);
  if (normalized === '') return [];

  const entries = [];
  for (const piece of normalized.split(/[、,;/|]+/)) {
    const trimmed = piece.trim();
    if (trimmed) entries.push(trimmed);
  }
  return entries;
}
/**
 * Reduce a free-text family-atmosphere description to one of three labels:
 * '冷漠', '温暖' or '中立'.
 *
 * Keyword groups are checked in order and the first group with a hit wins:
 * cold and conflict keywords both map to '冷漠', then warm keywords map to
 * '温暖'. Blank or unrecognised input is '中立'.
 *
 * @param {*} v Cell value.
 * @returns {string}
 */
function normalizeFamilyAtmosphere(v) {
  const s = text(v);
  const rules = [
    ['冷漠', ['冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂']],
    ['冷漠', ['争吵', '冲突', '矛盾', '紧张', '对立', '不和']],
    ['温暖', ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好']],
    ['中立', ['一般', '普通', '还行', '尚可', '平常']]
  ];
  for (const [label, keywords] of rules) {
    if (keywords.some((keyword) => s.includes(keyword))) {
      return label;
    }
  }
  return '中立';
}
/**
 * Reduce a free-text parent-child relationship description to '紧张',
 * '良好' or '中立'.
 *
 * Tension keywords take precedence over positive keywords; blank or
 * unrecognised input is '中立'.
 *
 * @param {*} v Cell value.
 * @returns {string}
 */
function normalizeParentChild(v) {
  const s = text(v);
  const tensePattern = /(紧张|疏离|冲突|差|糟)/;
  const goodPattern = /(良好|亲密|和谐|较好|很好)/;
  if (tensePattern.test(s)) {
    return '紧张';
  }
  return goodPattern.test(s) ? '良好' : '中立';
}
/**
 * Map a family-role description onto a canonical role label.
 *
 * The patterns are tried in order and the first match decides the label.
 * Unrecognised input is returned as its text()-normalised form, and blank
 * input yields ''.
 *
 * @param {*} v Cell value.
 * @returns {string}
 */
function normalizeRole(v) {
  const s = text(v);
  const roleTable = [
    [/(妈妈|母亲|妈咪)/, '妈妈'],
    [/(爸爸|父亲)/, '父亲'],
    [/(奶奶|祖母)/, '奶奶'],
    [/(爷爷|祖父)/, '爷爷'],
    [/(姥姥|外婆)/, '姥姥/外婆']
  ];
  const matched = roleTable.find(([pattern]) => pattern.test(s));
  return matched ? matched[1] : s;
}
/**
 * Map a numeric age onto its age-bracket tag.
 *
 * @param {number|null|undefined} age Age in years.
 * @returns {string} '' when age is null/undefined, otherwise the label of the
 *   first bracket whose exclusive upper bound exceeds the age, falling back
 *   to '55岁及以上'.
 */
function ageToTag(age) {
  if (age === null || age === undefined) return '';
  const brackets = [
    [25, '25岁以下'],
    [35, '25-34岁'],
    [45, '35-44岁'],
    [55, '45-54岁']
  ];
  for (const [upperBound, label] of brackets) {
    if (age < upperBound) return label;
  }
  return '55岁及以上';
}
/**
 * Map a grade description onto a school stage: '幼儿园', '小学', '初中',
 * '高中' or '大学'. Unrecognised input is returned as its text()-normalised
 * form, and blank input yields ''.
 *
 * "一年级" .. "六年级" only count as primary school when they are not
 * preceded by 初 / 高 / 大. Without that guard "初一年级" or "高二年级"
 * would be classified as '小学', because the primary-school check runs
 * before the junior/senior ones. 七/八/九年级 are the year-based names of
 * 初一/初二/初三 and map to '初中'.
 *
 * @param {*} v Cell value.
 * @returns {string}
 */
function normalizeGrade(v) {
  const s = text(v);
  if (!s) return '';
  if (/幼/.test(s)) return '幼儿园';
  if (/(小|(?<![初高大])(一年级|二年级|三年级|四年级|五年级|六年级))/.test(s)) return '小学';
  if (/(初一|初二|初三|初中|七年级|八年级|九年级)/.test(s)) return '初中';
  if (/(高一|高二|高三|高中)/.test(s)) return '高中';
  if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学';
  return s;
}
/**
 * Reduce a free-text academic-performance description to '优秀', '良好',
 * '较差' or '一般'.
 *
 * The patterns are tried in that order; blank or unrecognised input is
 * '一般'.
 *
 * @param {*} v Cell value.
 * @returns {string}
 */
function normalizeScore(v) {
  const s = text(v);
  const levels = [
    [/(优秀|优异|很好|拔尖)/, '优秀'],
    [/(良好|较好|不错)/, '良好'],
    [/(差|不理想|偏下|落后|薄弱)/, '较差']
  ];
  for (const [pattern, label] of levels) {
    if (pattern.test(s)) return label;
  }
  return '一般';
}
// Matches answers that open with a negation, optionally preceded by an adverb:
// "没有", "没", "无", "否", "不是", "不经常", "基本上没有", "从不", "从来没有", ...
// Such answers mean "no" even though they contain 有 / 是 / 分歧 / 打 / 骂.
const NEGATIVE_ANSWER_RE = /^(基本上?|几乎|从来?|一直)?(没有?|无|否|不|未)/;

/**
 * True when a text()-normalised yes/no answer affirms one of the keywords in
 * `pattern`. Blank answers and answers that open with a negation are "no".
 */
function isAffirmative(answer, pattern) {
  return answer !== '' && !NEGATIVE_ANSWER_RE.test(answer) && pattern.test(answer);
}

/**
 * True when a text()-normalised answer to "did the child grow up with the
 * parents?" indicates separation: either a keyword in `pattern` occurs, or
 * the answer opens with a negation such as "不是" / "没有".
 */
function indicatesSeparation(answer, pattern) {
  return pattern.test(answer) || NEGATIVE_ANSWER_RE.test(answer);
}

/**
 * Infer a core-problem tag for a row that has no explicit one. Every label
 * carries the "(推断)" suffix.
 *
 * Checks run in priority order: poor academic score, negating or punitive
 * parenting, strained family climate / relationship / educational
 * disagreement, major life events; otherwise a generic growth-support label.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string}
 */
function inferCoreProblem(row) {
  const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
  const atmosphere = normalizeFamilyAtmosphere(row['家庭氛围']);
  const relation = normalizeParentChild(row['亲子关系']);
  const divergence = text(row['家长有无教育分歧']);
  const negate = text(row['是否经常否定孩子']);
  const physical = text(row['有无打骂教育']);
  const majorEvent = text(row['重大影响事件_扩展']);
  if (score === '较差') return '学习动力与执行(推断)';
  if (isAffirmative(negate, /(有|是|存在|经常)/) || isAffirmative(physical, /(有|是|存在|经常)/)) {
    return '教养方式调整(推断)';
  }
  if (atmosphere === '冷漠' || relation === '紧张' || isAffirmative(divergence, /(有|是|分歧)/)) {
    return '亲子沟通修复(推断)';
  }
  if (/(离异|变故|创伤|重大)/.test(majorEvent)) {
    return '情绪与安全感支持(推断)';
  }
  return '阶段性成长支持(推断)';
}

/**
 * Collect the education-risk tags that apply to a row.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string[]} Zero or more risk labels.
 */
function inferEducationRisk(row) {
  const risk = [];
  const divergence = text(row['家长有无教育分歧']);
  const negate = text(row['是否经常否定孩子']);
  const physical = text(row['有无打骂教育']);
  const withParents = text(row['孩子是否在父母身边长大']);
  // "不一致" opens with 不 but states a disagreement, so it is tested before
  // the negation-aware check.
  if (/不一致/.test(divergence) || isAffirmative(divergence, /(有|是|分歧)/)) risk.push('教育理念分歧');
  if (isAffirmative(negate, /(有|是|经常|总是)/)) risk.push('否定式沟通风险');
  if (isAffirmative(physical, /(有|是|打|骂|体罚)/)) risk.push('惩罚式教育风险');
  if (indicatesSeparation(withParents, /(否|不在|老人|寄养|留守)/)) risk.push('陪伴不足风险');
  return risk;
}

/**
 * Derive the family-structure tags for a row.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string[]} At least one label; '常规家庭结构' when nothing else applies.
 */
function inferFamilyStructure(row) {
  const tags = [];
  const basic = text(row['家庭基本情况_规范'] || row['家庭基本情况']);
  const withParents = text(row['孩子是否在父母身边长大']);
  const caregivers = text(row['还有谁参与孩子的养育']);
  if (/单亲|离异/.test(basic)) tags.push('单亲家庭');
  if (/重组/.test(basic)) tags.push('重组家庭');
  if (/三代同堂|隔代|祖/.test(basic) || /爷爷|奶奶|姥姥|外婆|祖/.test(caregivers)) tags.push('隔代参与家庭');
  if (indicatesSeparation(withParents, /(否|不在|寄养|留守)/)) tags.push('分离养育家庭');
  if (!tags.length) tags.push('常规家庭结构');
  return tags;
}

/**
 * Rate how urgent the family's need is.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '高紧迫度', '中紧迫度' or '低紧迫度'.
 */
function inferUrgency(row) {
  const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
  const relation = normalizeParentChild(row['亲子关系']);
  const physical = text(row['有无打骂教育']);
  if (score === '较差' || relation === '紧张' || isAffirmative(physical, /(有|是|打|骂)/)) return '高紧迫度';
  if (score === '一般') return '中紧迫度';
  return '低紧迫度';
}

/**
 * Rate the expected intervention difficulty with a simple point score:
 * tense relationship +2, educational disagreement +1, frequent negation +1,
 * physical punishment +2. A score of 4 or more is high, 2 or more is medium.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '高干预难度', '中干预难度' or '低干预难度'.
 */
function inferInterventionDifficulty(row) {
  let score = 0;
  const relation = normalizeParentChild(row['亲子关系']);
  const divergence = text(row['家长有无教育分歧']);
  const negate = text(row['是否经常否定孩子']);
  const physical = text(row['有无打骂教育']);
  if (relation === '紧张') score += 2;
  if (isAffirmative(divergence, /(有|是|分歧)/)) score += 1;
  if (isAffirmative(negate, /(有|是|经常)/)) score += 1;
  if (isAffirmative(physical, /(有|是|打|骂)/)) score += 2;
  if (score >= 4) return '高干预难度';
  if (score >= 2) return '中干预难度';
  return '低干预难度';
}
/**
 * Combine urgency and intervention difficulty into a conversion priority.
 *
 * High urgency without high difficulty is high priority; high urgency with
 * high difficulty, or medium difficulty at any urgency, is medium priority;
 * everything else is low priority.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '高优先级', '中优先级' or '低优先级'.
 */
function inferConversionPriority(row) {
  const isUrgent = inferUrgency(row) === '高紧迫度';
  const difficulty = inferInterventionDifficulty(row);

  if (isUrgent && difficulty !== '高干预难度') {
    return '高优先级';
  }
  if (isUrgent || difficulty === '中干预难度') {
    return '中优先级';
  }
  return '低优先级';
}
/**
 * Pick the preferred communication channel from the questionnaire text.
 *
 * Online keywords are checked before offline keywords; blank or unrecognised
 * text yields the standard channel.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '线上沟通优先', '线下面谈优先' or '标准沟通'.
 */
function inferChannelAdaption(row) {
  const answer = text(row['问卷评估']);
  const channels = [
    [/(线上|微信|视频)/, '线上沟通优先'],
    [/(线下|到访|面谈)/, '线下面谈优先']
  ];
  const matched = channels.find(([pattern]) => pattern.test(answer));
  return matched ? matched[1] : '标准沟通';
}
/**
 * Choose the product plan from academic score and parent-child relationship.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '综合干预方案' when the score is poor and the relationship
 *   tense, '学习提升方案' for a poor score only, '亲子沟通方案' for a tense
 *   relationship only, otherwise '成长支持方案'.
 */
function inferProductMatch(row) {
  const poorScore = normalizeScore(row['学习成绩_规范'] || row['学习成绩']) === '较差';
  const tenseRelation = normalizeParentChild(row['亲子关系']) === '紧张';

  if (poorScore) {
    return tenseRelation ? '综合干预方案' : '学习提升方案';
  }
  return tenseRelation ? '亲子沟通方案' : '成长支持方案';
}
/**
 * Recommend a service duration from urgency and intervention difficulty.
 *
 * @param {Object<string, *>} row Row values keyed by header name.
 * @returns {string} '12周' for high urgency or high difficulty, '8周' for
 *   medium urgency, otherwise '4周'.
 */
function inferServiceDuration(row) {
  const urgencyLevel = inferUrgency(row);
  const difficultyLevel = inferInterventionDifficulty(row);
  const needsLongPlan = urgencyLevel === '高紧迫度' || difficultyLevel === '高干预难度';

  if (needsLongPlan) {
    return '12周';
  }
  return urgencyLevel === '中紧迫度' ? '8周' : '4周';
}
/**
 * Recompute `coverage` and `coverage_rate` for every tag.
 *
 * `coverage` is the number of distinct users carrying the tag and
 * `coverage_rate` is that number as a percentage of the user count, rounded
 * to two decimals. When the users table is empty, TOTAL_USERS_FALLBACK is
 * used as the denominator.
 *
 * @param {object} db Open database handle exposing prepare()/get()/all()/run().
 */
function updateTagStats(db) {
  const { n: userCount } = db.prepare('SELECT COUNT(*) as n FROM users').get();
  const denominator = userCount || TOTAL_USERS_FALLBACK;
  const tagRows = db.prepare('SELECT id FROM tags').all();
  const refreshStmt = db.prepare(`
    UPDATE tags SET
      coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
      coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
    WHERE id = ?
  `);
  for (const { id } of tagRows) {
    refreshStmt.run(denominator, id);
  }
}
/**
 * Entry point of the 清洗3.0 import: a full replacement of users, tags,
 * categories and user-tag links from the first worksheet of EXCEL_FILE.
 *
 * The workbook is read and its required headers are validated BEFORE any
 * existing data is deleted, so an unreadable or malformed file does not wipe
 * the database. Each data row with a family role becomes one user;
 * `extra_json.inferredCore` records whether the core-problem tag had to be
 * inferred because the source column was empty.
 *
 * On any error inside the import the message and stack are printed, the
 * database is closed and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('\n🚀 清洗3.0 导入流程 v1.0\n');
  initializeDatabase(DB_THEME);
  const db = getDb(DB_THEME);
  try {
    // Load and validate the workbook first; nothing has been deleted yet.
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.worksheets[0];
    console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`);

    // Header name -> column number, taken from the first row.
    const headerRow = worksheet.getRow(1);
    const headers = {};
    headerRow.eachCell((cell, colNumber) => {
      headers[text(cell.value)] = colNumber;
    });
    const needHeaders = [
      '家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范',
      '家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子',
      '有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育',
      '重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称'
    ];
    const missing = needHeaders.filter((h) => !headers[h]);
    if (missing.length) {
      throw new Error(`缺少关键表头: ${missing.join(', ')}`);
    }

    // Full replacement: clear old data with foreign-key enforcement off.
    db.pragma('foreign_keys = OFF');
    console.log('🗑️ 清空旧数据...');
    db.prepare('DELETE FROM user_tags').run();
    db.prepare('DELETE FROM users').run();
    db.prepare('DELETE FROM tags').run();
    db.prepare('DELETE FROM tag_categories').run();

    // Recreate the categories; the array index becomes sort_order.
    const categoryMap = {};
    const insertCategoryStmt = db.prepare(`
      INSERT INTO tag_categories (key, name, color, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    TAG_CATEGORIES.forEach((cat, idx) => {
      const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
      categoryMap[cat.key] = result.lastInsertRowid;
    });
    console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类（已删除付费能力）`);

    // Statements are prepared once and reused for every row.
    const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)');
    const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)');
    const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
    const findTagStmt = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?');

    // Cache: "<category key>:<tag name>" -> tag id.
    const tagCache = new Map();

    // Returns the id of tag `tagName` in category `catKey`, creating the tag
    // on first use. Returns null for blank names.
    function getOrCreateTag(catKey, tagName) {
      const n = text(tagName);
      if (!n) return null;
      const cacheKey = `${catKey}:${n}`;
      if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);
      let tag = findTagStmt.get(categoryMap[catKey], n);
      if (!tag) {
        const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`;
        const result = insertTagStmt.run(key, n, categoryMap[catKey]);
        tag = { id: result.lastInsertRowid };
      }
      tagCache.set(cacheKey, tag.id);
      return tag.id;
    }

    let rowCount = 0;
    let inserted = 0;
    let inferredCoreCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // header row
      rowCount += 1;
      const rowObj = {};
      for (const [name, idx] of Object.entries(headers)) {
        rowObj[name] = row.getCell(idx).value;
      }
      const role = normalizeRole(rowObj['家庭角色']);
      if (!role) return;

      // Decide up front whether the core problem must be inferred, so the
      // flag stored in extra_json is accurate.
      const originCore = splitMulti(rowObj['参加指导最想解决_扩展']);
      const coreIsInferred = originCore.length === 0;

      const fileName = text(rowObj['文件名称']);
      const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60);
      const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`;
      const userExtra = {
        rowNumber,
        inferredCore: coreIsInferred,
        source: 'clean3.0'
      };
      const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra));
      if (!result.changes) return;
      inserted += 1;
      const userId = result.lastInsertRowid;
      const addTag = (catKey, tagName) => {
        const tagId = getOrCreateTag(catKey, tagName);
        if (tagId) insertUserTagStmt.run(userId, tagId);
      };

      // Basic tags.
      addTag('basic_info_role', role);
      addTag('basic_info_education', text(rowObj['文化程度']));

      // Age group: the numeric ages of guardian 1 and guardian 2 are both
      // tagged; no other guardian-2 field is used.
      const age1 = parseNumber(rowObj['年龄_数值']);
      const age2 = parseNumber(rowObj['年龄_2_数值']);
      addTag('user_age_group', ageToTag(age1));
      addTag('user_age_group', ageToTag(age2));

      // School stage.
      addTag('child_grade', normalizeGrade(rowObj['年级_规范']));

      // Family structure.
      inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t));

      // Education risk.
      inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t));

      // Family support: three-level atmosphere plus parent-child relationship.
      addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`);
      addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`);

      // Urgency, difficulty, priority.
      addTag('urgency', inferUrgency(rowObj));
      addTag('intervention_difficulty', inferInterventionDifficulty(rowObj));
      addTag('conversion_priority', inferConversionPriority(rowObj));

      // Channel / product / duration.
      addTag('channel_adaption', inferChannelAdaption(rowObj));
      addTag('product_match', inferProductMatch(rowObj));
      addTag('service_duration', inferServiceDuration(rowObj));

      // Core problem: prefer the source values, otherwise fall back to a
      // conservative inference whose label carries the "(推断)" suffix.
      if (coreIsInferred) {
        addTag('core_problem', inferCoreProblem(rowObj));
        inferredCoreCount += 1;
      } else {
        originCore.forEach((tag) => addTag('core_problem', tag));
      }
      if (rowCount % 500 === 0) {
        console.log(` ✓ 已处理 ${rowCount}`);
      }
    });
    console.log(`\n✅ 导入用户: ${inserted}`);
    console.log(`✅ 核心问题推断数: ${inferredCoreCount}`);
    updateTagStats(db);

    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories,
        (SELECT COUNT(*) FROM user_tags) as total_rels
    `).get();
    console.log('\n📊 结果统计');
    console.log(` • 用户数: ${stats.total_users}`);
    console.log(` • 标签数: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    console.log(` • 关系数: ${stats.total_rels}`);

    // Sanity checks: the payment-ability category must be gone, and the
    // number of distinct inferred core-problem tags is reported.
    const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n;
    console.log(` • 付费能力分类存在数: ${deletedPayment}`);
    const inferredTags = db.prepare(`
      SELECT COUNT(*) as n FROM tags t
      JOIN tag_categories c ON c.id = t.category_id
      WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)'
    `).get().n;
    console.log(` • 推断核心问题标签种类: ${inferredTags}`);
    db.pragma('foreign_keys = ON');
    db.close();
    console.log('\n🎉 清洗3.0导入完成\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    try { db.close(); } catch (_) {}
    process.exit(1);
  }
}
// Run the import when this script is executed.
main();

View File

@@ -0,0 +1,673 @@
/**
* 新数据导入脚本 v3.0
* 基于"清洗1.0.xlsx"的完整标签体系
*
* 标签体系49个标签分为5个维度
* 用法: node scripts/import-clean-data.js
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = path.join(__dirname, '../清洗1.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// Tag category definitions v3.0 - 49 tags in 5 dimensions
// Each entry names the worksheet column(s) it is derived from (`columns`) and how
// the value is meant to be interpreted (`type`, optional `keywords`).
// NOTE(review): importCleanData only reads key / name / color; confirm whether
// `columns`, `type` and `keywords` are consumed anywhere else.
// ════════════════════════════════════════════════════════════════════════════
const TAG_CATEGORIES = [
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
// Dimension 1: guardian information (19 tags)
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
key: 'guardian_role',
name: '监护人身份',
color: '#3b82f6',
columns: [1], // A: family role
type: 'discrete'
},
{
key: 'guardian_education',
name: '文化程度',
color: '#6366f1',
columns: [2], // B: education level
type: 'discrete'
},
{
key: 'guardian_occupation',
name: '职业与经济地位',
color: '#8b5cf6',
columns: [3], // C: occupation
type: 'discrete'
},
{
key: 'guardian_age_group',
name: '监护人年龄段',
color: '#a78bfa',
columns: [4], // D: age
type: 'continuous'
},
{
key: 'second_guardian_role',
name: '第二监护人身份',
color: '#c084fc',
columns: [5], // E: second family-role column
type: 'discrete'
},
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
// Dimension 2: child information (13 tags)
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
key: 'child_gender',
name: '孩子性别',
color: '#ec4899',
columns: [6], // F: gender
type: 'discrete'
},
{
key: 'child_grade',
name: '孩子学段',
color: '#f472b6',
columns: [7], // G: grade
type: 'discrete'
},
{
key: 'child_academic_score',
name: '学习成绩',
color: '#f97316',
columns: [8], // H: academic performance
type: 'discrete'
},
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
// Dimension 3: family environment (8 tags)
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
key: 'family_structure',
name: '家庭结构',
color: '#06b6d4',
columns: [9], // I: basic family situation
type: 'keyword_extract',
keywords: ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家']
},
{
key: 'parent_child_relationship',
name: '亲子关系',
color: '#0891b2',
columns: [10], // J: parent-child relationship
type: 'text'
},
{
key: 'child_living_with_parents',
name: '与父母同住情况',
color: '#10b981',
columns: [14], // N: whether the child grew up with the parents
type: 'yes_no'
},
{
key: 'child_caregivers',
name: '参与养育人员',
color: '#059669',
columns: [15], // O: who else takes part in raising the child
type: 'text'
},
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
// Dimension 4: education risk (6 tags)
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
key: 'education_consensus',
name: '教育理念一致性',
color: '#f59e0b',
columns: [11], // K: whether the parents disagree about education
type: 'yes_no'
},
{
key: 'child_negation',
name: '否定孩子情况',
color: '#d97706',
columns: [12], // L: whether the child is frequently negated
type: 'yes_no'
},
{
key: 'physical_punishment',
name: '打骂教育',
color: '#dc2626',
columns: [13], // M: whether beating/scolding is used as discipline
type: 'yes_no'
},
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
// Dimension 5: service plan (3 tags)
// ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
{
key: 'service_duration',
name: '服务周期',
color: '#7c3aed',
columns: [17], // Q: number of days
type: 'discrete'
}
];
// Value mapping and normalisation rules.
// Outer key: tag category key. Inner map: raw cell value -> canonical tag name.
// addUserTags looks the trimmed cell value up here; for most categories a value
// without an entry is used as the tag name unchanged.
const TAG_MAPPINGS = {
'guardian_role': {
'母亲': '母亲',
'妈妈': '母亲',
'母': '母亲',
'父亲': '父亲',
'爸爸': '父亲',
'奶奶': '祖母',
'祖母': '祖母',
'爷爷': '祖父',
'外婆': '外祖母',
'外公': '外祖父',
'姥姥': '外祖母',
'姥爷': '外祖父',
'舅舅': '其他亲属',
'妻子': '其他亲属',
'大姐': '其他亲属'
},
'guardian_education': {
'初小': '小学',
'小学': '小学',
'初中': '初中',
'中师': '中专',
'中专': '中专',
'高中': '高中',
'大专': '大专',
'大学': '本科',
'本科': '本科',
'大学本科': '本科',
'硕士': '硕士及以上',
'研究生': '硕士及以上',
'在职研究生': '硕士及以上'
},
'child_gender': {
'男': '男孩',
'女': '女孩',
'女、男': '双胞胎'
},
'child_academic_score': {
'优秀': '优秀',
'良好': '良好',
'一般': '一般',
'差': '较差'
},
'child_living_with_parents': {
'是': '是',
'是的': '是',
'在': '是',
'否': '否',
'没有': '否',
'不是': '否'
},
'education_consensus': {
'有': '有分歧',
'是': '有分歧',
'否': '无分歧',
'无': '无分歧',
'没有': '无分歧'
},
'child_negation': {
'是': '是',
'有': '是',
'是的': '是',
'经常': '是',
'否': '否',
'无': '否',
'没有': '否',
'偶尔': '否'
},
'physical_punishment': {
'有': '有',
'是': '有',
'有过': '有',
'偶尔有': '有',
'无': '无',
'没有': '无',
'否': '无',
'基本上没有': '无'
},
'service_duration': {
'60天': '60天课程',
'90天': '90天课程',
'180天': '180天课程'
}
};
/**
 * Map a guardian age onto its age-group label.
 *
 * The value is parsed with parseInt (base 10) BEFORE the validity check, so
 * cell values with a unit suffix such as "35岁" are recognised. Missing,
 * zero or non-numeric input yields '年龄未知'.
 *
 * @param {number|string|null|undefined} age Raw age value.
 * @returns {string} Age-group label.
 */
function getAgeGroup(age) {
  if (!age) return '年龄未知';
  const ageNum = parseInt(age, 10);
  if (Number.isNaN(ageNum)) return '年龄未知';
  if (ageNum < 25) return '25岁以下';
  if (ageNum < 35) return '25-35岁';
  if (ageNum < 45) return '35-45岁';
  if (ageNum < 55) return '45-55岁';
  if (ageNum < 65) return '55-65岁';
  if (ageNum < 75) return '65-75岁';
  return '75岁以上';
}
/**
 * Map a grade description onto a school-segment label.
 *
 * Senior-high and junior-high markers are tested before the bare numerals.
 * "初一", "高二" and the like contain 一 / 二 / 三, so testing the numerals
 * first would classify every one of them as lower primary. 七/八/九年级 are
 * the year-based names of 初一/初二/初三. University years have no segment
 * and yield '学段未知'.
 *
 * @param {*} grade Raw grade value.
 * @returns {string} Segment label, or '学段未知' when nothing matches.
 */
function gradeToSegment(grade) {
  if (!grade) return '学段未知';
  const gradeStr = String(grade).toLowerCase();
  const has = (...needles) => needles.some((needle) => gradeStr.includes(needle));

  // No university segment is defined; keep 大一 etc. away from the numeral checks.
  if (has('大学', '大一', '大二', '大三', '大四')) return '学段未知';

  if (has('高三')) return '高中毕业班(高三)';
  if (has('高一', '高二')) return '高中前期(高一高二)';
  if (has('初三', '九年', '9年')) return '初中毕业班(初三)';
  if (has('初一', '初二', '准初', '七年', '八年', '7年', '8年')) return '初中前期(初一初二)';

  if (has('一', '1年', '二', '2年', '三', '3年')) return '小学低段(1-3年级)';
  if (has('四', '4年', '五', '5年', '六', '6年')) return '小学高段(4-6年级)';
  return '学段未知';
}
/**
 * Classify a free-text parent-child relationship description.
 *
 * Negative wording is tested first and mixed wording second, because both
 * usually contain the character 好 ("不好", "时好时坏"); testing the
 * positive keywords first would label them as good.
 *
 * @param {*} text Raw description.
 * @returns {string} '亲子关系较差', '亲子关系一般', '亲子关系良好',
 *   '亲子关系未评估' when no keyword matches, or '未指定' for empty input.
 */
function relationshipQuality(text) {
  if (!text) return '未指定';
  const lowerText = String(text).toLowerCase();
  const mentions = (keywords) => keywords.some((keyword) => lowerText.includes(keyword));

  if (mentions(['不好', '不太好', '不是很好', '不怎么好', '不和谐', '差', '紧张'])) {
    return '亲子关系较差';
  }
  if (mentions(['一般', '还行', '正常', '时好时坏'])) {
    return '亲子关系一般';
  }
  if (mentions(['良好', '好', '和谐', '可以', '还好', '较好', '还可以'])) {
    return '亲子关系良好';
  }
  return '亲子关系未评估';
}
/**
 * Import users and tags from the first worksheet of EXCEL_FILE into the
 * 'onion' database.
 *
 * Categories, users and user-tag links are written with INSERT OR IGNORE, so
 * existing rows are kept. A row whose uid already exists is not re-tagged.
 * All SQL statements are prepared once up front instead of being re-prepared
 * inside the loops. On any error the message and stack are printed and the
 * process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function importCleanData() {
  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }
    console.log(`📊 总行数: ${worksheet.rowCount}`);
    const db = getDb('onion');

    // Initialise the database.
    initializeDatabase('onion');

    // Create every tag category and remember its id.
    console.log('🏗️ 建立分类体系...');
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const findCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    const categoryMap = {};
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      // Re-select so the id is known even when the row already existed.
      const catRecord = findCategoryStmt.get(cat.key);
      categoryMap[cat.key] = catRecord.id;
    }
    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    // Statements used while processing the data rows.
    let insertedCount = 0;
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const findTagStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const tagKeyExistsStmt = db.prepare(`SELECT 1 FROM tags WHERE key = ?`);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    const findTagByKeyStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);

    // Cache: "<category key>:<tag name>" -> tag id.
    const tagCache = {};

    // Returns the id of tag `tagName` in category `catKey`, creating the tag
    // on first use. Returns null when either argument is missing.
    function getOrCreateTag(catKey, tagName) {
      if (!tagName || !catKey) return null;
      const cacheKey = `${catKey}:${tagName}`;
      if (tagCache[cacheKey]) return tagCache[cacheKey];

      // Reuse the tag when it already exists in the database.
      let tag = findTagStmt.get(categoryMap[catKey], tagName);
      if (!tag) {
        // Otherwise derive a key from a 24-bit hash of the normalised name.
        const tagNameNorm = String(tagName).toLowerCase().trim().replace(/\s+/g, '_');
        const hashCode = Array.from(tagNameNorm).reduce((h, c) => ((h << 5) - h) + c.charCodeAt(0), 0) & 0xffffff;
        let tagKey = `${catKey}_${hashCode.toString(16)}`;

        // Resolve key collisions with a numeric suffix.
        let counter = 1;
        while (tagKeyExistsStmt.get(tagKey)) {
          tagKey = `${catKey}_${hashCode.toString(16)}_${counter}`;
          counter++;
        }
        insertTagStmt.run(tagKey, tagName, categoryMap[catKey], 0);
        tag = findTagByKeyStmt.get(tagKey);
      }
      tagCache[cacheKey] = tag?.id;
      return tag?.id;
    }

    // Walk the Excel data rows.
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row
      rowCount++;
      const values = row.values || [];

      // Basic information.
      const uid = `user_${rowNumber - 1}`; // simple sequential user id
      const guardianRole = values[1];
      const childGrade = values[7];
      const childDesc = values[16];
      if (!guardianRole) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少监护人身份，跳过`);
        return;
      }

      // Extra data stored with the user; the description is capped at 500 characters.
      const extraData = {
        row: rowNumber,
        guardianRole: guardianRole,
        childGrade: childGrade,
        childDescription: childDesc ? String(childDesc).substring(0, 500) : ''
      };

      // Insert the user; `changes` is 0 when the uid already exists.
      const result = insertUserStmt.run(uid, String(guardianRole), JSON.stringify(extraData));
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;

        // Attach the tags for this user.
        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap);
        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });
    console.log(`\n✅ 用户导入完成：${insertedCount}`);

    // Refresh the coverage statistics of every tag.
    console.log('🔄 更新标签统计...');
    updateTagStats(db);
    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    process.exit(1);
  }
}
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap) {
// 监护人身份
if (values[1]) {
const role = String(values[1]).trim();
const mapped = TAG_MAPPINGS.guardian_role[role] || role;
const tagId = getOrCreateTag('guardian_role', mapped);
if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId);
if (rowNumber <= 5) console.log(` [行${rowNumber}] 监护人身份: "${role}" -> "${mapped}" (tagId: ${tagId})`);
}
// 文化程度
if (values[2]) {
const edu = String(values[2]).trim();
const mapped = TAG_MAPPINGS.guardian_education[edu] || edu;
const tagId = getOrCreateTag('guardian_education', mapped);
if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId);
if (rowNumber <= 5) console.log(` [行${rowNumber}] 文化程度: "${edu}" -> "${mapped}" (tagId: ${tagId})`);
}
// 职业(分类)
if (values[3]) {
const job = String(values[3]).trim().toLowerCase();
let jobCategory = '其他';
// 简单的职业分类
if (job.includes('教师') || job.includes('医生') || job.includes('工程') || job.includes('律师')) {
jobCategory = '专业人士';
} else if (job.includes('工人') || job.includes('工厂')) {
jobCategory = '工人';
} else if (job.includes('农') || job.includes('农民') || job.includes('务农')) {
jobCategory = '农民';
} else if (job.includes('员工') || job.includes('职员') || job.includes('公务') || job.includes('干部')) {
jobCategory = '公司/政府工作人员';
} else if (job.includes('退休') || job.includes('离退休')) {
jobCategory = '退休人士';
} else if (job.includes('个体') || job.includes('自由') || job.includes('经营')) {
jobCategory = '个体户/自由职业';
} else if (job.includes('商业') || job.includes('销售')) {
jobCategory = '销售/商业';
} else if (job.includes('家')) {
jobCategory = '家务';
}
const tagId = getOrCreateTag('guardian_occupation', jobCategory);
if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId);
if (rowNumber <= 5) console.log(` [行${rowNumber}] 职业: "${job}" -> "${jobCategory}" (tagId: ${tagId})`);
}
// 年龄分组
if (values[4]) {
const ageGroup = getAgeGroup(values[4]);
const tagId = getOrCreateTag('guardian_age_group', ageGroup);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 第二监护人身份
if (values[5]) {
const role2 = String(values[5]).trim();
if (role2 && role2 !== '无' && role2 !== '/') {
const mapped = TAG_MAPPINGS.guardian_role[role2] || role2;
const tagId = getOrCreateTag('second_guardian_role', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
// 孩子性别
if (values[6]) {
const gender = String(values[6]).trim();
const mapped = TAG_MAPPINGS.child_gender[gender] || gender;
const tagId = getOrCreateTag('child_gender', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 孩子学段
if (values[7]) {
const segment = gradeToSegment(values[7]);
const tagId = getOrCreateTag('child_grade', segment);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 学习成绩
if (values[8]) {
const scoreStr = String(values[8]).trim();
// 处理混合值
const scores = scoreStr.split(/[、,]/).map(s => s.trim()).filter(s => s && !s.includes('null'));
for (const score of scores) {
const mapped = TAG_MAPPINGS.child_academic_score[score] || score;
const tagId = getOrCreateTag('child_academic_score', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
// 家庭结构(关键词提取)
if (values[9]) {
const familyStr = String(values[9]).trim();
const keywords = ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代'];
const found = new Set();
for (const kw of keywords) {
if (familyStr.includes(kw) && !found.has(kw)) {
found.add(kw);
const tagId = getOrCreateTag('family_structure', kw);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
// 如果没有识别任何关键词,用原始值
if (found.size === 0) {
const tagId = getOrCreateTag('family_structure', familyStr.substring(0, 50));
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
// 亲子关系
if (values[10]) {
const relationship = relationshipQuality(values[10]);
const tagId = getOrCreateTag('parent_child_relationship', relationship);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 教育理念一致性
if (values[11]) {
const consensus = String(values[11]).trim();
const mapped = TAG_MAPPINGS.education_consensus[consensus] || (consensus.includes('有') ? '有分歧' : '无分歧');
const tagId = getOrCreateTag('education_consensus', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 是否否定孩子
if (values[12]) {
const negation = String(values[12]).trim();
const mapped = TAG_MAPPINGS.child_negation[negation] || (negation.includes('是') || negation.includes('有') ? '是' : '否');
const tagId = getOrCreateTag('child_negation', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 打骂教育
if (values[13]) {
const punishment = String(values[13]).trim();
const mapped = TAG_MAPPINGS.physical_punishment[punishment] || (punishment.includes('有') ? '有' : '无');
const tagId = getOrCreateTag('physical_punishment', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 孩子与父母同住
if (values[14]) {
const living = String(values[14]).trim();
// 尝试映射,如果映射失败,尝试关键字匹配
let mapped = TAG_MAPPINGS.child_living_with_parents[living];
if (!mapped) {
// 关键字匹配
if (living.includes('是') && !living.includes('不是')) {
mapped = '是';
} else if (living.includes('否') || living.includes('不是')) {
mapped = '否';
} else {
mapped = '是'; // 默认
}
}
const tagId = getOrCreateTag('child_living_with_parents', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 参与养育人员 - 提取关键信息
if (values[15]) {
const caregiverStr = String(values[15]).trim();
if (caregiverStr && caregiverStr !== '无' && caregiverStr !== '没有') {
// 识别主要的养育者
let caregiver = '其他';
if (caregiverStr.includes('妈妈')) caregiver = '母亲';
else if (caregiverStr.includes('父亲') || caregiverStr.includes('爸爸')) caregiver = '父亲';
else if (caregiverStr.includes('爷爷')) caregiver = '祖父';
else if (caregiverStr.includes('奶奶')) caregiver = '祖母';
else if (caregiverStr.includes('外公')) caregiver = '外祖父';
else if (caregiverStr.includes('外婆')) caregiver = '外祖母';
else if (caregiverStr.includes('祖')) caregiver = '祖父母';
else if (caregiverStr.includes('外')) caregiver = '外祖父母';
const tagId = getOrCreateTag('child_caregivers', caregiver);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
// 服务周期
if (values[17]) {
const duration = String(values[17]).trim();
const mapped = TAG_MAPPINGS.service_duration[duration] || duration;
const tagId = getOrCreateTag('service_duration', mapped);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
}
/**
 * Recompute the `coverage` and `coverage_rate` columns of every row in `tags`.
 *
 * `coverage` is the number of `user_tags` rows that reference the tag;
 * `coverage_rate` is that count as a percentage of the row count of `users`,
 * rounded to two decimals (0 when there are no users).
 *
 * @param {object} db Database handle exposing `prepare(sql)`; the returned
 *   statements must offer `all()`, `get()` and `run()`.
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;

  // Prepare once and reuse: the SQL text never changes between tags.
  const countStmt = db.prepare(`
    SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?
  `);
  const updateStmt = db.prepare(`
    UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
  `);

  for (const tag of tags) {
    const coverage = countStmt.get(tag.id).n || 0;
    // toFixed() returns a string; convert back so a number is bound,
    // matching how the merge scripts write coverage_rate.
    const coverageRate = totalUsers > 0
      ? Number((coverage / totalUsers * 100).toFixed(2))
      : 0;
    updateStmt.run(coverage, coverageRate, tag.id);
  }
}
importCleanData();

414
scripts/import-excel.js Normal file
View File

@@ -0,0 +1,414 @@
/**
* Excel 数据导入脚本 v2
* 将"家庭教育档案-天数.xlsx"中的完整数据导入到数据库
* 支持多维度标签分类
*
* 用法: node scripts/import-excel.js [path/to/file.xlsx]
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = process.argv[2] || path.join(__dirname, '../家庭教育档案-天数.xlsx');
// ────────────────────────────────────
// 标签分类定义
// ────────────────────────────────────
// Tag category definitions. Each entry names one category (`key`, `name`,
// optional `color`) and the worksheet column index (`column`) whose cell
// value addUserTags turns into a tag of that category.
const TAG_CATEGORIES = [
  // 1. Guardian information
  {
    key: 'guardian_role',
    name: '监护人身份',
    color: '#3b82f6',
    column: 3 // C: family role
  },
  {
    key: 'guardian_education',
    name: '监护人文化程度',
    color: '#8b5cf6',
    column: 4 // D: education level
  },
  {
    key: 'guardian1_personality',
    name: '监护人1性格特征',
    color: '#a78bfa',
    column: 7 // G: personality traits
  },
  {
    key: 'guardian2_personality',
    name: '监护人2性格特征',
    color: '#c084fc',
    column: 14 // N: personality traits (second guardian)
  },
  // 2. Child information
  {
    key: 'child_gender',
    name: '孩子性别',
    color: '#ec4899',
    column: 17 // Q: gender
  },
  {
    key: 'child_personality',
    name: '孩子性格特征',
    color: '#f472b6',
    column: 20 // T: child personality traits
  },
  {
    key: 'child_score',
    name: '孩子学习成绩',
    color: '#f59e0b',
    column: 21 // U: academic performance
  },
  // 3. Family situation
  {
    key: 'family_structure',
    name: '家庭基本情况',
    color: '#06b6d4',
    column: 23 // W: basic family situation (contains phrases such as "三代同堂")
  },
  {
    key: 'family_atmosphere',
    name: '家庭氛围',
    color: '#10b981',
    column: 24 // X: family atmosphere
  },
  {
    key: 'parent_child_relation',
    name: '亲子关系',
    color: '#6366f1',
    column: 25 // Y: parent-child relationship
  },
  // 4. Parenting behaviour
  // These entries define no `color`; importExcelData falls back to '#6366f1'
  // when it inserts the category.
  {
    key: 'education_conflict',
    name: '教育理念一致性',
    column: 26 // Z: whether the parents disagree about education
  },
  {
    key: 'child_negation',
    name: '否定现象',
    column: 27 // AA: whether the child is frequently negated
  },
  {
    key: 'physical_punishment',
    name: '纪律方式',
    column: 28 // AB: whether scolding/beating is used
  },
  {
    key: 'child_with_parents',
    name: '亲子陪伴',
    column: 29 // AC: whether the child grew up with the parents
  },
  // 5. Guidance period
  {
    key: 'duration',
    name: '指导周期',
    color: '#ef4444',
    column: 38 // AL: number of days
  }
];
// Tag value mapping: converts raw Excel cell values into canonical tag names.
// Outer keys are category keys from TAG_CATEGORIES; inner keys are the trimmed
// cell text. addUserTags keeps the raw text when no entry matches.
const TAG_VALUE_MAP = {
  'guardian_role': {
    '母亲': '母亲',
    '妈妈': '母亲',
    '母': '母亲',
    '父亲': '父亲',
    '爸爸': '父亲',
    '奶奶': '奶奶',
    '爷爷': '爷爷',
    '外婆': '外婆',
    '外公': '外公',
    '姥姥': '外婆',
    '姥爷': '外公',
    '祖母': '奶奶',
    '大姐': '成年子女',
    '舅舅': '其他亲属',
    '妻子': '配偶'
  },
  'guardian_education': {
    '初中': '初中',
    '初小': '小学',
    '小学': '小学',
    '中师': '中专',
    '中专': '中专',
    '高中': '高中',
    '大专': '大专',
    '大学': '本科',
    '本科': '本科',
    '大学本科': '本科',
    '硕士': '硕士',
    '研究生': '硕士',
    '在职研究生': '硕士'
  },
  'child_gender': {
    '女': '女孩',
    '男': '男孩',
    '女、男': '双胞胎'
  },
  // Both word grades and letter grades map onto the same four levels.
  'child_score': {
    '优秀': '优秀',
    '良好': '良好',
    '一般': '一般',
    '差': '较差',
    '较差': '较差',
    'A': '优秀',
    'B': '良好',
    'C': '一般',
    'D': '较差'
  },
  // NOTE(review): keys include the "天" suffix, so a purely numeric cell
  // (e.g. 60) would not match — confirm the column really holds text.
  'duration': {
    '60天': '60天课程',
    '180天': '180天课程',
    '90天': '90天课程',
    '365天': '365天课程'
  }
};
// Categories whose cell text is additionally scanned for keywords; every
// keyword found in the text becomes an extra tag in the same category.
// NOTE(review): addUserTags only reads `keywords` and takes the column from
// TAG_CATEGORIES (23 for family_structure), so `column: 22` here is unused and
// disagrees with that definition — confirm which index is correct.
const KEYWORD_EXTRACTION_FIELDS = {
  'family_structure': {
    column: 22,
    keywords: ['三代同堂', '四口之家', '三口之家', '单亲', '离异', '隔代抚养', '二代', '三代']
  }
};
/**
 * Build the value stored in `tags.key` for a tag.
 *
 * Long personality descriptions (more than 30 characters, category key
 * containing "personality") keep their original derivation: an ASCII slug of
 * the first 20 characters plus an 8-character MD5 prefix of the full name.
 *
 * All other names are slugged with a Unicode-aware pattern. The previous
 * `/[^\w]/g` pattern only keeps ASCII word characters, so every Chinese tag
 * name in a category was reduced to the same key `<catKey>_` and resolved to
 * whichever tag had been created first. ASCII-only names yield the same key
 * as before.
 *
 * @param {string} catKey  Category key, e.g. 'guardian_role'.
 * @param {string} tagName Display name of the tag.
 * @returns {string} Key that distinguishes tags within the category.
 */
function buildTagKey(catKey, tagName) {
  const shortHash = () =>
    require('crypto').createHash('md5').update(tagName).digest('hex').substring(0, 8);

  if (catKey.includes('personality') && tagName.length > 30) {
    const simplified = tagName.substring(0, 20).toLowerCase().replace(/\s+/g, '_').replace(/[^\w]/g, '');
    return `${catKey}_${simplified}_${shortHash()}`;
  }

  const slug = tagName.toLowerCase().replace(/\s+/g, '_').replace(/[^\p{L}\p{M}\p{N}_]/gu, '');
  // A name made only of punctuation leaves nothing to slug; fall back to the
  // hash so such names still get distinct keys.
  return slug ? `${catKey}_${slug}` : `${catKey}_${shortHash()}`;
}

/**
 * Import the worksheet at EXCEL_FILE into the 'onion' database.
 *
 * Steps: read the first worksheet, make sure every entry of TAG_CATEGORIES
 * exists in `tag_categories`, insert one `users` row per data row (uid = the
 * file-name cell in column 1, rows without it are skipped), attach tags via
 * addUserTags for newly inserted users only, then refresh tag coverage
 * statistics and print a summary.
 *
 * Any error is logged and terminates the process with exit code 1.
 */
async function importExcelData() {
  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }
    console.log(`📊 总行数: ${worksheet.rowCount}`);

    const db = getDb('onion');
    // Initialise the database before any statement is prepared.
    initializeDatabase('onion');

    // Create all tag categories and remember their ids.
    console.log('🏗️ 建立分类体系...');
    const categoryMap = {};
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const selectCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      categoryMap[cat.key] = selectCategoryStmt.get(cat.key).id;
    }
    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    let insertedCount = 0;
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const selectTagStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);

    // Cache of "<catKey>:<tagName>" -> tag id for this run.
    const tagCache = {};

    // Return the id of the tag `tagName` in category `catKey`, creating the
    // tag on first use. Returns null when either argument is empty.
    function getOrCreateTag(catKey, tagName) {
      if (!tagName || !catKey) return null;
      const cacheKey = `${catKey}:${tagName}`;
      if (tagCache[cacheKey]) return tagCache[cacheKey];

      const tagKey = buildTagKey(catKey, tagName);
      let tag = selectTagStmt.get(tagKey);
      if (!tag) {
        insertTagStmt.run(tagKey, tagName, categoryMap[catKey], 0);
        tag = selectTagStmt.get(tagKey);
      }
      tagCache[cacheKey] = tag?.id;
      return tag?.id;
    }

    // Walk the data rows of the worksheet.
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row
      rowCount++;
      const values = row.values || [];

      const fileName = values[1]; // file name, used as the user's uid
      const childName = values[16]; // child's name
      if (!fileName) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少文件名,跳过`);
        return;
      }

      // Extra per-user data stored as JSON.
      // NOTE(review): TAG_CATEGORIES labels column 17 as gender and column 23
      // as the family situation, yet they are stored here as childAge and
      // familyAddress — confirm these indexes against the sheet.
      const extraData = {
        fileName: fileName,
        childName: childName || '',
        guardian1Name: values[2],
        childAge: values[17],
        grade: values[19],
        learningScore: values[21],
        familyAddress: values[23],
        questionnaireSummary: values[37],
      };

      const result = insertUserStmt.run(fileName, childName || fileName, JSON.stringify(extraData));
      // changes === 0 means the uid already existed; such rows get no tags.
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;
        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt);
        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });
    console.log(`\n✅ 用户导入完成:${insertedCount}`);

    // Refresh coverage statistics for all tags.
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
      (SELECT COUNT(*) FROM users) as total_users,
      (SELECT COUNT(*) FROM tags) as total_tags,
      (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    process.exit(1);
  }
}
/**
 * Attach tags to one user based on a worksheet row.
 *
 * For every category in TAG_CATEGORIES the cell at the category's column is
 * trimmed, optionally translated through TAG_VALUE_MAP and linked to the user.
 * A `child_score` cell containing "、" is split into one tag per part.
 * Categories listed in KEYWORD_EXTRACTION_FIELDS additionally get one tag per
 * keyword contained in the (mapped) text.
 *
 * @param {number} userId              Id of the user row to tag.
 * @param {Array} values               Row values indexed by column number.
 * @param {number} rowNumber           Worksheet row number (not used here).
 * @param {Function} getOrCreateTag    (catKey, tagName) -> tag id or falsy.
 * @param {object} insertUserTagStmt   Statement whose run(userId, tagId) links the two.
 */
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt) {
  // Link the user to `tagName` in `catKey`; names that yield no tag id are ignored.
  const link = (catKey, tagName) => {
    const id = getOrCreateTag(catKey, tagName);
    if (id) {
      insertUserTagStmt.run(userId, id);
    }
  };

  TAG_CATEGORIES.forEach(({ key, column }) => {
    if (column >= values.length) return;
    const raw = values[column];
    if (!raw) return;

    const text = String(raw).trim();
    const valueMap = TAG_VALUE_MAP[key];

    // Mixed grades such as "优秀、良好" become one tag per grade.
    if (key === 'child_score' && text.includes('、')) {
      text
        .split('、')
        .map((part) => part.trim())
        .forEach((part) => link(key, valueMap?.[part] || part));
      return;
    }

    const tagName = valueMap && valueMap[text] ? valueMap[text] : text;
    link(key, tagName);

    const extraction = KEYWORD_EXTRACTION_FIELDS[key];
    if (!extraction) return;
    extraction.keywords
      .filter((keyword) => tagName.includes(keyword))
      .forEach((keyword) => link(key, keyword));
  });
}
/**
 * Recompute the `coverage` and `coverage_rate` columns of every row in `tags`.
 *
 * `coverage` is the number of `user_tags` rows that reference the tag;
 * `coverage_rate` is that count as a percentage of the row count of `users`,
 * rounded to two decimals (0 when there are no users).
 *
 * @param {object} db Database handle exposing `prepare(sql)`; the returned
 *   statements must offer `all()`, `get()` and `run()`.
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;

  // Prepare once and reuse: the SQL text never changes between tags.
  const countStmt = db.prepare(`
    SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?
  `);
  const updateStmt = db.prepare(`
    UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
  `);

  for (const tag of tags) {
    const coverage = countStmt.get(tag.id).n || 0;
    // toFixed() returns a string; convert back so a number is bound,
    // matching how the merge scripts write coverage_rate.
    const coverageRate = totalUsers > 0
      ? Number((coverage / totalUsers * 100).toFixed(2))
      : 0;
    updateStmt.run(coverage, coverageRate, tag.id);
  }
}
importExcelData();

View File

@@ -0,0 +1,192 @@
/**
* 从清洗1.0.xlsx 中导入标签数据到现有的清洗2.0 用户
*
* 策略:
* 1. 读取清洗1.0.xlsx 的标签列18-31
* 2. 尝试通过前7列数据匹配清洗2.0中的用户
* 3. 导入匹配到的标签
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb } = require('../db/init');
/**
 * Copy tag columns 18-31 of 清洗1.0.xlsx onto users imported from 清洗2.0.xlsx.
 *
 * Rows of the two workbooks are matched on the trimmed text of their first
 * seven columns. A matched 2.0 row is looked up in `users` by uid
 * `user_<rowNumber>`; each non-empty tag cell of the matching 1.0 row is then
 * linked to that user, creating the tag when it does not exist yet.
 * Categories missing from `tag_categories` are skipped.
 *
 * Errors are logged, the database handle is closed, and the process exits
 * with code 1.
 */
async function main() {
  let db = null;
  try {
    console.log('\n╔════════════════════════════════════════════════════════════════╗');
    console.log('║ 📥 从清洗1.0 导入标签数据 ║');
    console.log('╚════════════════════════════════════════════════════════════════╝\n');

    // Worksheet column -> tag category.
    const TAG_COLUMN_MAP = {
      18: { catKey: 'user_identity', catName: '用户身份标签' },
      19: { catKey: 'user_age_group', catName: '用户年龄段标签' },
      20: { catKey: 'child_grade', catName: '孩子学段标签' },
      21: { catKey: 'family_structure', catName: '家庭结构标签' },
      22: { catKey: 'education_risk', catName: '教育风险标签' },
      23: { catKey: 'family_support', catName: '家庭支持度标签' },
      24: { catKey: 'payment_ability', catName: '付费能力标签' },
      25: { catKey: 'urgency', catName: '需求紧迫度标签' },
      26: { catKey: 'core_problem', catName: '核心问题标签' },
      27: { catKey: 'intervention_difficulty', catName: '干预难度标签' },
      28: { catKey: 'conversion_priority', catName: '转化优先级标签' },
      29: { catKey: 'service_duration', catName: '服务周期标签' },
      30: { catKey: 'channel_adaption', catName: '渠道适配标签' },
      31: { catKey: 'product_match', catName: '产品匹配标签' }
    };

    // Matching key of a row: trimmed text of columns 1-7 joined by "|".
    const rowKey = (row) => [1, 2, 3, 4, 5, 6, 7].map((c) => {
      const v = row.values[c];
      return v ? String(v).trim() : '';
    }).join('|');

    db = getDb('onion');

    console.log('📖 读取清洗1.0.xlsx...');
    const wb1 = new ExcelJS.Workbook();
    await wb1.xlsx.readFile(path.join(__dirname, '../清洗1.0.xlsx'));
    const ws1 = wb1.worksheets[0];

    console.log('📖 读取清洗2.0.xlsx...');
    const wb2 = new ExcelJS.Workbook();
    await wb2.xlsx.readFile(path.join(__dirname, '../清洗2.0.xlsx'));
    const ws2 = wb2.worksheets[0];

    // Index the 1.0 rows by key. When two rows share a key the later one wins.
    const map1 = {};
    const tagData1 = {};
    ws1.eachRow((row, rowNum) => {
      if (rowNum === 1) return; // skip header
      const key = rowKey(row);
      map1[key] = rowNum;

      const tags = {};
      for (const [col, info] of Object.entries(TAG_COLUMN_MAP)) {
        const tagValue = row.values[parseInt(col, 10)];
        if (tagValue && String(tagValue).trim() !== '') {
          if (!tags[info.catKey]) tags[info.catKey] = [];
          tags[info.catKey].push(String(tagValue).trim());
        }
      }
      tagData1[key] = tags;
    });
    console.log(` • 清洗1.0 索引: ${Object.keys(map1).length}\n`);

    let matched = 0;
    let tagInserted = 0;
    const tagCache = {};
    const insertTagStmt = db.prepare(`
      INSERT OR IGNORE INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
      VALUES (?, ?, ?, 0, 0, 0)
    `);
    const getTagIdStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    // Prepared once instead of once per worksheet row.
    const findUserStmt = db.prepare('SELECT id FROM users WHERE uid = ?');

    // Category key -> category id.
    const catIdMap = {};
    const categories = db.prepare('SELECT id, key FROM tag_categories').all();
    for (const cat of categories) {
      catIdMap[cat.key] = cat.id;
    }

    console.log('🔗 匹配清洗2.0用户...\n');
    ws2.eachRow((row, rowNum) => {
      if (rowNum === 1) return; // skip header
      const key = rowKey(row);
      if (!map1[key]) return;

      // Users are looked up by the uid `user_<rowNum>`.
      const user = findUserStmt.get(`user_${rowNum}`);
      if (!user) return;

      const tags = tagData1[key];
      for (const [catKey, tagValues] of Object.entries(tags)) {
        const catId = catIdMap[catKey];
        if (!catId) continue;

        for (const tagValue of tagValues) {
          const cacheKey = `${catId}:${tagValue}`;
          let tagId = tagCache[cacheKey];
          if (!tagId) {
            const existing = getTagIdStmt.get(catId, tagValue);
            if (existing) {
              tagId = existing.id;
            } else {
              insertTagStmt.run(
                `${catKey}_${Math.random().toString(36).slice(2)}`,
                tagValue,
                catId
              );
              // INSERT OR IGNORE may insert nothing (constraint conflict), so
              // the follow-up lookup can legitimately come back empty.
              const created = getTagIdStmt.get(catId, tagValue);
              if (!created) {
                console.warn(`⚠️ 无法创建标签: ${catKey} / ${tagValue}`);
                continue;
              }
              tagId = created.id;
            }
            tagCache[cacheKey] = tagId;
          }
          if (tagId) {
            const info = insertUserTagStmt.run(user.id, tagId);
            // Count only links that were really inserted, not ignored duplicates.
            if (info.changes > 0) tagInserted++;
          }
        }
      }

      matched++;
      if (matched % 500 === 0) {
        console.log(` ✓ 已匹配 ${matched} 行...`);
      }
    });

    console.log(`\n✅ 标签导入完成:`);
    console.log(` • 匹配用户: ${matched}`);
    console.log(` • 导入标签链接: ${tagInserted}`);

    console.log('\n📊 标签分布:');
    const tagStats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.id
      LIMIT 16
    `).all();
    for (const stat of tagStats) {
      console.log(`${stat.name}: ${stat.tag_count} tags, ${stat.user_count || 0} users`);
    }

    db.close();
    db = null; // already closed; nothing left for the error path to release
  } catch (e) {
    console.error('❌ Error:', e.message);
    if (db) db.close();
    process.exit(1);
  }
}
main();

168
scripts/merge-tags-v2.js Normal file
View File

@@ -0,0 +1,168 @@
const { getDb } = require('../db/init');
const db = getDb('onion');
// Family-role tag merge table, based on the values observed in the data.
// Shape: { categoryName: { masterTagName: [synonymTagName, ...] } }.
// mergeTags moves the users of every synonym tag onto the master tag and then
// deletes the synonym tag.
// NOTE(review): '姥姥': ['姥爷'] and '外婆': ['外公'] fold the grandfather
// labels into the grandmother tags — confirm this is intended.
const MERGE_MAPPING = {
  '家庭角色': {
    '妈妈': ['母亲', '母親', '孩子母亲', '孩子妈妈', '全职妈妈', '妈咪', '蚂妈', '妈妈一', '妈妈初', '妈妈大专', '母', '女主人', '母亲初初', '母亲中中中', '家庭主妇', '照孩子'],
    '父亲': ['爸爸', '父', '爸', '养父'],
    '奶奶': ['祖母'],
    '姥姥': ['姥爷'],
    '爷爷': ['祖父'],
    '外婆': ['外公'],
  }
};
// Erroneous tag names to delete (meaningless, or belonging to another category).
const INVALID_TAGS = ['初中', '文 化', ''];
/**
 * Merge synonym tags into their master tag and remove invalid tags.
 *
 * For every category in MERGE_MAPPING, the users of each existing synonym tag
 * are re-linked to the master tag, the synonym tag is deleted, and the master
 * tag's coverage statistics are recomputed against the current number of
 * users. Afterwards every tag named in INVALID_TAGS is deleted — only inside
 * the categories handled here, so an identically named tag that is valid in
 * another category (e.g. '初中' as an education level) is left alone.
 *
 * Prints a summary, closes the database and exits with code 0; on error the
 * database is closed and the exit code is 1.
 */
function mergeTags() {
  try {
    console.log('🔄 开始合并同类标签...\n');
    let totalMerged = 0;
    let totalDeleted = 0;

    // Statements are prepared once; the loops below only bind new parameters.
    const findCategoryStmt = db.prepare(
      'SELECT id FROM tag_categories WHERE name = ?'
    );
    const findTagStmt = db.prepare(
      'SELECT id FROM tags WHERE name = ? AND category_id = ?'
    );
    const countUsersStmt = db.prepare(
      'SELECT COUNT(DISTINCT user_id) as count FROM user_tags WHERE tag_id = ?'
    );
    const moveLinksStmt = db.prepare(
      `INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?`
    );
    const deleteLinksStmt = db.prepare(
      'DELETE FROM user_tags WHERE tag_id = ?'
    );
    const deleteTagStmt = db.prepare(
      'DELETE FROM tags WHERE id = ?'
    );
    const updateCoverageStmt = db.prepare(
      'UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?'
    );

    // Read the real user count instead of relying on a figure from an earlier run.
    const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n;

    // Ids of the categories that were actually found; used to scope the cleanup.
    const mappedCategoryIds = [];

    for (const [categoryName, tagMappings] of Object.entries(MERGE_MAPPING)) {
      console.log(`\n📁 分类: ${categoryName}`);
      const categoryResult = findCategoryStmt.get(categoryName);
      if (!categoryResult) {
        console.log(`❌ 无法找到分类: ${categoryName}`);
        continue;
      }
      const categoryId = categoryResult.id;
      mappedCategoryIds.push(categoryId);

      for (const [masterTagName, synonyms] of Object.entries(tagMappings)) {
        console.log(`\n 主标签: ${masterTagName}`);
        const masterTag = findTagStmt.get(masterTagName, categoryId);
        if (!masterTag) {
          console.log(` ❌ 主标签 "${masterTagName}" 不存在`);
          continue;
        }
        const masterTagId = masterTag.id;

        for (const synonym of synonyms) {
          const synonymTag = findTagStmt.get(synonym, categoryId);
          if (!synonymTag) {
            console.log(` ⚠️ 同义词 "${synonym}" 不存在,跳过`);
            continue;
          }
          const synonymTagId = synonymTag.id;
          const userCount = countUsersStmt.get(synonymTagId)?.count || 0;

          // Re-link users to the master tag first, then drop the synonym's
          // links and finally the synonym tag itself.
          moveLinksStmt.run(masterTagId, synonymTagId);
          deleteLinksStmt.run(synonymTagId);
          deleteTagStmt.run(synonymTagId);

          console.log(` ✅ 合并 "${synonym}" (${userCount} 用户) → "${masterTagName}"`);
          totalMerged++;
        }

        // Refresh the master tag's coverage figures.
        const newCoverage = countUsersStmt.get(masterTagId)?.count || 0;
        const coverageRate = totalUsers > 0
          ? ((newCoverage / totalUsers) * 100).toFixed(2)
          : '0.00';
        updateCoverageStmt.run(newCoverage, parseFloat(coverageRate), masterTagId);
        console.log(` 📊 "${masterTagName}" 新覆盖: ${newCoverage} 用户 (${coverageRate}%)`);
      }
    }

    // Delete invalid tags, restricted to the categories processed above.
    console.log(`\n\n🗑️ 删除无效标签...`);
    for (const invalidTagName of INVALID_TAGS) {
      for (const categoryId of mappedCategoryIds) {
        for (const invalidTag of findTagStmt.all(invalidTagName, categoryId)) {
          deleteLinksStmt.run(invalidTag.id);
          deleteTagStmt.run(invalidTag.id);
          console.log(` ✅ 删除无效标签: "${invalidTagName}"`);
          totalDeleted++;
        }
      }
    }

    console.log(`\n\n✨ 合并完成!`);
    console.log(`📊 统计:`);
    console.log(` • 合并的同义词: ${totalMerged}`);
    console.log(` • 删除的无效标签: ${totalDeleted}`);

    // Totals after the merge.
    const tagCountResult = db.prepare('SELECT COUNT(*) as count FROM tags').get();
    const userTagCountResult = db.prepare('SELECT COUNT(*) as count FROM user_tags').get();
    console.log(` • 剩余标签总数: ${tagCountResult.count}`);
    console.log(` • 用户-标签关系总数: ${userTagCountResult.count}`);

    // Current state of the family-role category.
    console.log(`\n📋 家庭角色分类的最新状态:`);
    const finalTags = db.prepare(
      `SELECT name, coverage, coverage_rate
      FROM tags
      WHERE category_id = (SELECT id FROM tag_categories WHERE name = '家庭角色')
      ORDER BY coverage DESC`
    ).all();
    finalTags.forEach((tag) => {
      console.log(`${tag.name}: ${tag.coverage} 用户 (${tag.coverage_rate}%)`);
    });
    console.log(`\n✨ 总计: ${finalTags.length} 个家庭角色标签`);
    console.log(`\n💡 提示: 请执行以下命令重启服务器以清除缓存:`);
    console.log(` pkill -f "node server.js" && sleep 2 && node server.js &\n`);
    db.close();
    process.exit(0);
  } catch (error) {
    console.error('❌ 错误:', error);
    db.close();
    process.exit(1);
  }
}
mergeTags();

144
scripts/merge-tags.js Normal file
View File

@@ -0,0 +1,144 @@
/**
* 合并同义标签脚本
* 定义同义词映射,将重复标签合并到主标签
*/
const { getDb } = require('../db/init');
// Synonym mapping per category.
// Shape: { categoryName: { masterTagName: [synonym1, synonym2, ...] } }
const MERGE_MAPPING = {
  // Family role - keep the short, canonical spelling as the master tag
  '家庭角色': {
    '妈妈': ['母亲', '母親', '孩子母亲', '孩子妈妈', '全职妈妈', '妈咪', '蚂妈', '妈妈一', '妈妈初', '妈妈大专', '妈', '女主人'],
    '爸爸': ['父亲', '父', '爸'],
    '奶奶': ['祖母'],
    '爷爷': ['祖父'],
    // mergeTags looks synonyms up by exact name, so '外公 alternate' only
    // matches a tag literally called that. The original note says 外公 is the
    // other gender. NOTE(review): presumably a placeholder to avoid merging
    // 外公 into 外婆 — confirm.
    '外婆': ['外公 alternate'],
    // NOTE(review): this does merge the grandfather label 姥爷 into 姥姥,
    // unlike the 外婆 entry above — confirm this is intended.
    '姥姥': ['姥爷'],
  },
  // Other categories are not merged for now
};
/**
 * Merge synonym tags into their master tag as described by MERGE_MAPPING.
 *
 * For each category (looked up by name) and each master tag, the users of
 * every existing synonym tag are re-linked to the master tag, the synonym tag
 * is deleted, and the master tag's `coverage` / `coverage_rate` are
 * recomputed. Missing categories, master tags and synonyms are reported and
 * skipped. A per-category summary is printed at the end.
 *
 * On error the database is closed and the process exits with code 1.
 */
async function mergeTags() {
  const db = getDb('onion');
  try {
    console.log('\n' + '='.repeat(70));
    console.log('🔗 开始合并同义标签');
    console.log('='.repeat(70) + '\n');
    let totalMerged = 0;
    let totalDeleted = 0;

    // Statements are prepared once; the loops below only bind new parameters.
    const findCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE name = ?
    `);
    const findTagStmt = db.prepare(`
      SELECT id, coverage FROM tags
      WHERE category_id = ? AND name = ?
    `);
    const moveStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      SELECT user_id, ? FROM user_tags WHERE tag_id = ?
    `);
    const deleteLinksStmt = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
    const deleteTagStmt = db.prepare('DELETE FROM tags WHERE id = ?');
    const countUsersStmt = db.prepare(`
      SELECT COUNT(DISTINCT user_id) as cnt FROM user_tags WHERE tag_id = ?
    `);
    const updateStatsStmt = db.prepare(`
      UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
    `);
    // Nothing in this function writes to `users`, so the total is read once.
    const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n;

    for (const [categoryName, mapping] of Object.entries(MERGE_MAPPING)) {
      console.log(`\n📂 处理分类: ${categoryName}`);
      console.log('-'.repeat(70));

      const category = findCategoryStmt.get(categoryName);
      if (!category) {
        console.log(` ⚠️ 分类不存在`);
        continue;
      }
      const categoryId = category.id;

      for (const [masterName, synonyms] of Object.entries(mapping)) {
        const masterTag = findTagStmt.get(categoryId, masterName);
        if (!masterTag) {
          console.log(` ⚠️ 主标签不存在: ${masterName}`);
          continue;
        }
        console.log(`\n ✓ 主标签: ${masterName} (ID: ${masterTag.id}, 用户数: ${masterTag.coverage})`);

        for (const synonym of synonyms) {
          const synonymTag = findTagStmt.get(categoryId, synonym);
          if (!synonymTag) {
            console.log(`${synonym} (不存在,跳过)`);
            continue;
          }
          console.log(` • 合并 ${synonym} (ID: ${synonymTag.id}, 用户数: ${synonymTag.coverage})`);

          // 1. Re-link the synonym's users to the master tag.
          moveStmt.run(masterTag.id, synonymTag.id);
          // 2. Remove the synonym's user links.
          deleteLinksStmt.run(synonymTag.id);
          // 3. Remove the synonym tag itself.
          deleteTagStmt.run(synonymTag.id);

          totalMerged++;
          totalDeleted++;
        }

        // Refresh the master tag's statistics.
        const coverage = countUsersStmt.get(masterTag.id).cnt || 0;
        const coverage_rate = totalUsers > 0 ? +(coverage / totalUsers * 100).toFixed(2) : 0;
        updateStatsStmt.run(coverage, coverage_rate, masterTag.id);
        console.log(` ✅ 更新主标签统计: ${coverage} 用户 (${coverage_rate}%)`);
      }
    }

    console.log('\n' + '='.repeat(70));
    console.log(`✅ 合并完成`);
    console.log(` • 合并数量: ${totalMerged} 个同义标签`);
    console.log(` • 删除数量: ${totalDeleted} 个重复标签`);
    console.log('='.repeat(70) + '\n');

    // Per-category statistics after the merge.
    console.log('📊 合并后的分类统计:');
    const stats = db.prepare(`
      SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count,
      ROUND(COUNT(DISTINCT ut.user_id) * 100.0 / (SELECT COUNT(*) FROM users), 1) as coverage
      FROM tag_categories tc
      LEFT JOIN tags t ON tc.id = t.category_id
      LEFT JOIN user_tags ut ON t.id = ut.tag_id
      GROUP BY tc.id
      ORDER BY tc.sort_order
    `).all();
    for (const stat of stats) {
      console.log(`${stat.name.padEnd(20)}: ${stat.tag_count} tags, ${stat.user_count || 0} users (${stat.coverage || 0}%)`);
    }
    db.close();
  } catch (e) {
    console.error('❌ 错误:', e.message);
    console.error(e);
    db.close();
    process.exit(1);
  }
}
mergeTags();

124
scripts/quality-check-1.py Normal file
View File

@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""全面质量检查脚本"""
import openpyxl
import sqlite3
print("\n" + "="*70)
print("🔍 全面质量检查")
print("="*70 + "\n")

# ============================================================================
# 1. Excel workbook comparison
# ============================================================================
print("1⃣ EXCEL 文件结构和内容对比")
print("-"*70 + "\n")

# Original export plus the two cleaned versions; only the active sheet of
# each workbook is inspected.
wb0 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/家庭教育档案-天数.xlsx')
wb1 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗1.0.xlsx')
wb2 = openpyxl.load_workbook('/Users/inkling/Desktop/dmp/清洗2.0.xlsx')
ws0, ws1, ws2 = wb0.active, wb1.active, wb2.active

print(f"📊 行列统计:")
print(f" 原始(家庭教育档案-天数): {ws0.max_row} rows × {ws0.max_column} cols")
print(f" 清洗1.0: {ws1.max_row} rows × {ws1.max_column} cols")
print(f" 清洗2.0: {ws2.max_row} rows × {ws2.max_column} cols")

# Header comparison for the first 16 columns.
print(f"\n📋 列结构对比:")
print(f" {'':<3} {'原始':<25} {'清洗1.0':<25} {'清洗2.0':<25} {'状态':<5}")
print(f" {'-'*3} {'-'*25} {'-'*25} {'-'*25} {'-'*5}")
for col in range(1, 17):
    h0 = str(ws0.cell(1, col).value or '')[:22]
    h1 = str(ws1.cell(1, col).value or '')[:22]
    h2 = str(ws2.cell(1, col).value or '')[:22]
    # The status column compares the two cleaned files only. Both branches
    # used to be the empty string, so the column never showed anything.
    match = "✓" if h1 == h2 else "✗"
    print(f" {col:<3} {h0:<25} {h1:<25} {h2:<25} {match:<5}")

# Data completeness check
print(f"\n✅ 数据完整性 (前100行检查):")
def check_null_rate(ws, start_col=1, end_col=16, rows=100):
    """Count empty cells per column in the first data rows of a worksheet.

    Row 1 is treated as the header, so inspection starts at row 2 and covers
    at most ``rows`` rows. Columns ``start_col``..``end_col`` are inspected,
    clipped to ``ws.max_column``.

    Returns a dict mapping column number to ``(nulls, total, percent)`` where
    ``percent`` is ``100 * nulls / total``. The dict is empty when the sheet
    has no data rows.
    """
    data_rows = range(2, min(rows + 2, ws.max_row + 1))
    total = len(data_rows)
    results = {}
    if total == 0:
        return results

    last_col = min(end_col + 1, ws.max_column + 1)
    for col in range(start_col, last_col):
        nulls = sum(1 for row in data_rows if ws.cell(row, col).value is None)
        results[col] = (nulls, total, 100 * nulls / total)
    return results
def _print_null_summary(label, null_rates):
    """Print one line listing the columns of a sheet that contain empty cells.

    ``null_rates`` is the dict returned by ``check_null_rate``. The line is
    always terminated exactly once, whether or not any column has empty cells.
    """
    print(f" {label}: ", end="")
    if all(rate == 0 for _, _, rate in null_rates.values()):
        print("✓ 完全无缺失值")
        return
    flagged = [
        f"{col}({rate:.0f}%) "
        for col, (_, _, rate) in sorted(null_rates.items())
        if rate > 0
    ]
    print("".join(flagged))


nulls1 = check_null_rate(ws1)
nulls2 = check_null_rate(ws2)
_print_null_summary("清洗1.0", nulls1)
_print_null_summary("清洗2.0", nulls2)
# ============================================================================
# 2. Database content check
# ============================================================================
print(f"\n\n2⃣ 数据库内容检查")
print("-"*70 + "\n")

conn = sqlite3.connect('/Users/inkling/Desktop/dmp/dmp_onion.db')
try:
    cursor = conn.cursor()

    # Row counts of the four tables.
    cursor.execute('SELECT COUNT(*) FROM users')
    user_count = cursor.fetchone()[0]
    print(f"👥 用户数: {user_count}")

    cursor.execute('SELECT COUNT(*) FROM tags')
    tag_count = cursor.fetchone()[0]
    print(f"🏷️ 标签数: {tag_count}")

    cursor.execute('SELECT COUNT(*) FROM tag_categories')
    cat_count = cursor.fetchone()[0]
    print(f"📂 分类数: {cat_count}")

    cursor.execute('SELECT COUNT(*) FROM user_tags')
    rel_count = cursor.fetchone()[0]
    print(f"🔗 关系数: {rel_count}")

    # Per-category distribution: tags, distinct users and user-tag links.
    # NOTE(review): COUNT(ut.id) requires user_tags to have an `id` column;
    # the import scripts only write user_id and tag_id — confirm the schema.
    print(f"\n📊 标签分类分布:")
    cursor.execute('''
    SELECT tc.name, COUNT(DISTINCT t.id) as tag_count,
    COUNT(DISTINCT ut.user_id) as user_count,
    COUNT(ut.id) as rel_count
    FROM tag_categories tc
    LEFT JOIN tags t ON tc.id = t.category_id
    LEFT JOIN user_tags ut ON t.id = ut.tag_id
    GROUP BY tc.id
    ORDER BY tc.id
    ''')
    for row in cursor.fetchall():
        name, tags, users, rels = row
        # users > 0 implies user_count > 0, so the division cannot hit zero.
        coverage = f"{(users*100/user_count):.0f}%" if users else "0%"
        print(f"{name:<20} {tags:3d} tags, {users:4d} users ({coverage:>3s}), {rels:5d} relations")
finally:
    # Release the connection even when one of the queries above raises.
    conn.close()
print("\n" + "="*70)