Files
onion-dmp/scripts/clean-family-role-noise-v2.js
2026-04-08 14:52:09 +08:00

224 lines
8.1 KiB
JavaScript

const { getDb } = require('../db/init');
const db = getDb('onion');
const CATEGORY_KEY = 'basic_info_role';
const RENAME_MAP = new Map([
// 妈妈系
['母', '妈妈'],
['妈', '妈妈'],
['母亲', '妈妈'],
['母 亲', '妈妈'],
['母親', '妈妈'],
['毋亲', '妈妈'],
['妈 妈', '妈妈'],
['妈吗', '妈妈'],
['妈好', '妈妈'],
['妈专', '妈妈'],
['蚂妈', '妈妈'],
['宝妈', '妈妈'],
['全职妈妈', '妈妈'],
['家庭主妇', '妈妈'],
['主妇', '妈妈'],
['家家庭主妇', '妈妈'],
['女主人', '妈妈'],
// 爸爸系
['父', '爸爸'],
['爸', '爸爸'],
['父亲', '爸爸'],
['父 亲', '爸爸'],
['孩子爸', '爸爸'],
['爸专', '爸爸'],
['爸备', '爸爸'],
// 祖辈系
['祖父', '爷爷'],
['姥爷', '外公'],
['外爷', '外公'],
['祖母', '奶奶'],
['姥姥', '外婆'],
['姥姥/外婆', '外婆'],
['外婆', '外婆'],
['婆婆', '奶奶'],
// 其他明确亲属
['姑妈', '姑姑'],
]);
// 这些值属于家庭角色中的明确亲属关系,保留即可
const KEEP_SET = new Set([
'妈妈', '爸爸', '爷爷', '奶奶', '外公', '外婆',
'姑姑', '舅舅', '姨妈', '伯娘', '继母', '妻子',
'女儿', '儿子', '姐姐', '父母', '家长', '其他监护人',
]);
// 明显不是家庭角色的噪声、描述、乱码、占位符
const DELETE_EXACT = new Set([
'上班族', '母性', '女', '主', '主妇', '全职', '母中', '母女', '母子',
'一般', '陪读', '父母', '母家', '高中', '经济', '无', '目前', '内勤',
'带娃', '白黑', '家长', '全能', '次', '普通', '好人', '主导', '主角',
'主内', '主&角初中', '初中', '文 化', '/', 'I', '13296773713',
'盛自根', '经济支柱', '经济、教育、生活是核心', '助推庭教育',
'呵护,做具体事', '教育陪伴孩子', '照孩子', '家庭主妇', '家家庭主妇',
'妈专', '妈好', '妈吗', '妈 妈', '父 亲', '妈 亲', '母 亲', '母親',
'母', '父', '爸', '孩子爸', '爸专', '爸备', '宝妈', '蚂妈', '毋亲',
'外爷', '姥爷', '祖父', '祖母', '姑妈', '婆婆', '女主人', '母亲',
]);
const DELETE_PATTERNS = [
/^\d+$/, // 数字
/^[\s\W_]+$/, // 纯符号/空白
/联系方式|电话|手机号|微信/, // 联系方式片段
/上班|内勤|经济|教育|陪伴|助推|呵护|主导|主角|全能|普通|一般|目前|无|好人|次/,
/家庭主妇|主妇|全职|陪读|带娃/,
/文化|初中|高中|白黑|盛自根/,
];
function canonicalizeName(rawName) {
const name = String(rawName || '').trim();
if (!name) return null;
if (RENAME_MAP.has(name)) return RENAME_MAP.get(name);
return name;
}
function shouldDelete(name) {
if (DELETE_EXACT.has(name)) return true;
return DELETE_PATTERNS.some((re) => re.test(name));
}
function updateStats(dbConn) {
const totalUsers = dbConn.prepare('SELECT COUNT(*) AS n FROM users').get().n || 1;
const tags = dbConn.prepare('SELECT id FROM tags').all();
const stmt = dbConn.prepare(`
UPDATE tags
SET
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
WHERE id = ?
`);
for (const tag of tags) stmt.run(totalUsers, tag.id);
}
function main() {
try {
const category = db.prepare('SELECT id FROM tag_categories WHERE key = ?').get(CATEGORY_KEY);
if (!category) throw new Error(`找不到分类: ${CATEGORY_KEY}`);
const catId = category.id;
const tags = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);
console.log('🧹 开始清理家庭角色噪声数据...');
console.log(`📂 当前标签数: ${tags.length}`);
let merged = 0;
let deleted = 0;
let kept = 0;
const tx = db.transaction(() => {
const getByName = db.prepare('SELECT id, name FROM tags WHERE category_id = ? AND name = ?');
const insertRel = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
const deleteRel = db.prepare('DELETE FROM user_tags WHERE tag_id = ?');
const deleteTag = db.prepare('DELETE FROM tags WHERE id = ?');
const updateTag = db.prepare('UPDATE tags SET name = ? WHERE id = ?');
for (const tag of tags) {
const originalName = String(tag.name || '').trim();
const canonicalName = canonicalizeName(originalName);
if (KEEP_SET.has(originalName)) {
kept += 1;
continue;
}
if (canonicalName && canonicalName !== originalName && KEEP_SET.has(canonicalName)) {
const target = getByName.get(catId, canonicalName);
if (target) {
// 先把关系迁移过去,再删除旧标签
db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id)
SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id);
deleteRel.run(tag.id);
deleteTag.run(tag.id);
console.log(`✅ 合并: ${originalName} -> ${canonicalName}`);
merged += 1;
}
continue;
}
// 未在保留名单中:如果是明显噪声则删除
if (shouldDelete(originalName)) {
deleteRel.run(tag.id);
deleteTag.run(tag.id);
console.log(`🗑️ 删除: ${originalName}`);
deleted += 1;
continue;
}
// 其他未明确规则的值:保守处理,保留但不改名
kept += 1;
}
// 额外处理:把一些未能通过 canonicalize 但明显可归类到妈妈/爸爸的值再扫一遍
const leftovers = db.prepare('SELECT id, name FROM tags WHERE category_id = ?').all(catId);
for (const tag of leftovers) {
const name = String(tag.name || '').trim();
if (KEEP_SET.has(name)) continue;
if (/妈|母|宝妈/.test(name)) {
const target = getByName.get(catId, '妈妈');
if (target && target.id !== tag.id) {
db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id)
SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id);
deleteRel.run(tag.id);
deleteTag.run(tag.id);
console.log(`✅ 合并(兜底): ${name} -> 妈妈`);
merged += 1;
continue;
}
}
if (/爸|父|孩子爸/.test(name)) {
const target = getByName.get(catId, '爸爸');
if (target && target.id !== tag.id) {
db.prepare(`INSERT OR IGNORE INTO user_tags (user_id, tag_id)
SELECT user_id, ? FROM user_tags WHERE tag_id = ?`).run(target.id, tag.id);
deleteRel.run(tag.id);
deleteTag.run(tag.id);
console.log(`✅ 合并(兜底): ${name} -> 爸爸`);
merged += 1;
continue;
}
}
}
});
tx();
updateStats(db);
const stats = db.prepare(`
SELECT
(SELECT COUNT(*) FROM tags t JOIN tag_categories c ON c.id=t.category_id WHERE c.key = ?) AS tag_count,
(SELECT COUNT(*) FROM user_tags) AS rel_count,
(SELECT COUNT(*) FROM tags WHERE name = '妈妈') AS mom_count,
(SELECT COUNT(*) FROM tags WHERE name = '爸爸') AS dad_count,
(SELECT COUNT(*) FROM tags WHERE name = '爷爷') AS grandpa_count,
(SELECT COUNT(*) FROM tags WHERE name = '外公') AS mgp_count,
(SELECT COUNT(*) FROM tags WHERE name = '外婆') AS mgm_count
`).get(CATEGORY_KEY);
console.log('\n✨ 清理完成');
console.log(` • 合并: ${merged}`);
console.log(` • 删除: ${deleted}`);
console.log(` • 保留(未改名): ${kept}`);
console.log(` • 家庭角色标签剩余: ${stats.tag_count}`);
console.log(` • 妈妈/爸爸/爷爷/外公/外婆 计数: ${stats.mom_count}/${stats.dad_count}/${stats.grandpa_count}/${stats.mgp_count}/${stats.mgm_count}`);
db.close();
} catch (error) {
console.error('❌ 清理失败:', error);
try { db.close(); } catch (_) {}
process.exit(1);
}
}
main();