const { getDb } = require('../db/init'); const db = getDb('onion'); const CATEGORY_KEY = 'basic_info_role'; const RENAME_MAP = new Map([ // 妈妈系 ['母', '妈妈'], ['妈', '妈妈'], ['母亲', '妈妈'], ['母 亲', '妈妈'], ['母親', '妈妈'], ['毋亲', '妈妈'], ['妈 妈', '妈妈'], ['妈吗', '妈妈'], ['妈好', '妈妈'], ['妈专', '妈妈'], ['蚂妈', '妈妈'], ['宝妈', '妈妈'], ['全职妈妈', '妈妈'], ['家庭主妇', '妈妈'], ['主妇', '妈妈'], ['家家庭主妇', '妈妈'], ['女主人', '妈妈'], // 爸爸系 ['父', '爸爸'], ['爸', '爸爸'], ['父亲', '爸爸'], ['父 亲', '爸爸'], ['孩子爸', '爸爸'], ['爸专', '爸爸'], ['爸备', '爸爸'], // 祖辈系 ['祖父', '爷爷'], ['姥爷', '外公'], ['外爷', '外公'], ['祖母', '奶奶'], ['姥姥', '外婆'], ['姥姥/外婆', '外婆'], ['外婆', '外婆'], ['婆婆', '奶奶'], // 其他明确亲属 ['姑妈', '姑姑'], ]); // 这些值属于家庭角色中的明确亲属关系,保留即可 const KEEP_SET = new Set([ '妈妈', '爸爸', '爷爷', '奶奶', '外公', '外婆', '姑姑', '舅舅', '姨妈', '伯娘', '继母', '妻子', '女儿', '儿子', '姐姐', '父母', '家长', '其他监护人', ]); // 明显不是家庭角色的噪声、描述、乱码、占位符 const DELETE_EXACT = new Set([ '上班族', '母性', '女', '主', '主妇', '全职', '母中', '母女', '母子', '一般', '陪读', '父母', '母家', '高中', '经济', '无', '目前', '内勤', '带娃', '白黑', '家长', '全能', '次', '普通', '好人', '主导', '主角', '主内', '主&角初中', '初中', '文 化', '/', 'I', '13296773713', '盛自根', '经济支柱', '经济、教育、生活是核心', '助推庭教育', '呵护,做具体事', '教育陪伴孩子', '照孩子', '家庭主妇', '家家庭主妇', '妈专', '妈好', '妈吗', '妈 妈', '父 亲', '妈 亲', '母 亲', '母親', '母', '父', '爸', '孩子爸', '爸专', '爸备', '宝妈', '蚂妈', '毋亲', '外爷', '姥爷', '祖父', '祖母', '姑妈', '婆婆', '女主人', '母亲', ]); const DELETE_PATTERNS = [ /^\d+$/, // 数字 /^[\s\W_]+$/, // 纯符号/空白 /联系方式|电话|手机号|微信/, // 联系方式片段 /上班|内勤|经济|教育|陪伴|助推|呵护|主导|主角|全能|普通|一般|目前|无|好人|次/, /家庭主妇|主妇|全职|陪读|带娃/, /文化|初中|高中|白黑|盛自根/, ]; function canonicalizeName(rawName) { const name = String(rawName || '').trim(); if (!name) return null; if (RENAME_MAP.has(name)) return RENAME_MAP.get(name); return name; } function shouldDelete(name) { if (DELETE_EXACT.has(name)) return true; return DELETE_PATTERNS.some((re) => re.test(name)); } function updateStats(dbConn) { const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1; const tags = dbConn.prepare('SELECT id FROM tags').all(); const stmt = dbConn.prepare(` UPDATE tags SET coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id), coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2) WHERE id = ? `); for (const tag of tags) stmt.run(totalUsers, tag.id); } function main() { try { const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY); if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`); const catId = category.id; const tags = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId); console.log('🧹 开始清理家庭角色噪声数据...'); console.log(`📂 当前标签数: ${tags.length}`); let merged = 0; let deleted = 0; let kept = 0; const tx = db.transaction(() => { const getByName = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?'); const insertRel = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)'); const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?'); const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?'); const updateTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?'); for (const tag of tags) { const originalName = String(tag.name || '').trim(); const canonicalName = canonicalizeName(originalName); if (KEEP_SET.has(originalName)) { kept += 1; continue; } if (canonicalName && canonicalName !== originalName && KEEP_SET.has(canonicalName)) { const target = getByName.get(catId, canonicalName); if (target) { // 先把关系迁移过去,再删除旧标签 db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id) SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id); deleteRel.run(tag.id); deleteTag.run(tag.id); console.log(`✅ 合并: ${originalName} -> ${canonicalName}`); merged += 1; } continue; } // 未在保留名单中:如果是明显噪声则删除 if (shouldDelete(originalName)) { deleteRel.run(tag.id); deleteTag.run(tag.id); console.log(`🗑️ 删除: ${originalName}`); deleted += 1; continue; } // 其他未明确规则的值:保守处理,保留但不改名 kept += 1; } // 额外处理:把一些未能通过 canonicalize 但明显可归类到妈妈/爸爸的值再扫一遍 const leftovers = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId); for (const tag of leftovers) { const name = String(tag.name || '').trim(); if (KEEP_SET.has(name)) continue; if (/妈|母|宝妈/.test(name)) { const target = getByName.get(catId, '妈妈'); if (target && target.id !== tag.id) { db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id) SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id); deleteRel.run(tag.id); deleteTag.run(tag.id); console.log(`✅ 合并(兜底): ${name} -> 妈妈`); merged += 1; continue; } } if (/爸|父|孩子爸/.test(name)) { const target = getByName.get(catId, '爸爸'); if (target && target.id !== tag.id) { db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id) SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id); deleteRel.run(tag.id); deleteTag.run(tag.id); console.log(`✅ 合并(兜底): ${name} -> 爸爸`); merged += 1; continue; } } } }); tx(); updateStats(db); const stats = db.prepare(` SELECT (SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count, (SELECT COUNT(*) FROM user_tags) AS rel_count, (SELECT COUNT(*) FROM tags WHERE name = '妈妈') AS mom_count, (SELECT COUNT(*) FROM tags WHERE name = '爸爸') AS dad_count, (SELECT COUNT(*) FROM tags WHERE name = '爷爷') AS grandpa_count, (SELECT COUNT(*) FROM tags WHERE name = '外公') AS mgp_count, (SELECT COUNT(*) FROM tags WHERE name = '外婆') AS mgm_count `).get(CATEGORY_KEY); console.log('\n✨ 清理完成'); console.log(` • 合并: ${merged}`); console.log(` • 删除: ${deleted}`); console.log(` • 保留(未改名): ${kept}`); console.log(` • 家庭角色标签剩余: ${stats.tag_count}`); console.log(` • 妈妈/爸爸/爷爷/外公/外婆 计数: ${stats.mom_count}/${stats.dad_count}/${stats.grandpa_count}/${stats.mgp_count}/${stats.mgm_count}`); db.close(); } catch (error) { console.error('❌ 清理失败:', error); try { db.close(); } catch (_) {} process.exit(1); } } main();