Files
onion-dmp/scripts/import-clean-data-v3.js
2026-04-08 14:52:09 +08:00

449 lines
16 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 清洗3.0 导入脚本 v1.0
*
* 业务约束:
* 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)”
* 2) 监护人2相关字段不参与建模
* 3) 删除付费能力标签分类
* 4) 全量替换导入
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx');
const DB_THEME = 'onion';
const TOTAL_USERS_FALLBACK = 11500;
const TAG_CATEGORIES = [
{ key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
{ key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
{ key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
{ key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
{ key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
{ key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
{ key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
{ key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
{ key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
{ key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
{ key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
{ key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
{ key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
{ key: 'service_duration', name: '服务周期标签', color: '#7c3aed' }
];
function text(v) {
if (v === undefined || v === null) return '';
return String(v).replace(/\s+/g, ' ').trim();
}
function parseNumber(v) {
if (v === undefined || v === null || v === '') return null;
const raw = String(v).replace(/[^\d.\-]/g, '');
if (!raw) return null;
const n = Number(raw);
return Number.isFinite(n) ? n : null;
}
function splitMulti(v) {
const s = text(v);
if (!s) return [];
return s
.split(/[、,;/|]+/)
.map((item) => item.trim())
.filter(Boolean);
}
function normalizeFamilyAtmosphere(v) {
const s = text(v);
if (!s) return '中立';
const warm = ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好'];
const cold = ['冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂'];
const conflict = ['争吵', '冲突', '矛盾', '紧张', '对立', '不和'];
const neutral = ['一般', '普通', '还行', '尚可', '平常'];
const hit = (dict) => dict.some((k) => s.includes(k));
if (hit(cold) || hit(conflict)) return '冷漠';
if (hit(warm)) return '温暖';
if (hit(neutral)) return '中立';
return '中立';
}
function normalizeParentChild(v) {
const s = text(v);
if (!s) return '中立';
if (/(紧张|疏离|冲突|差|糟)/.test(s)) return '紧张';
if (/(良好|亲密|和谐|较好|很好)/.test(s)) return '良好';
return '中立';
}
function normalizeRole(v) {
const s = text(v);
if (!s) return '';
if (/(妈妈|母亲|妈咪)/.test(s)) return '妈妈';
if (/(爸爸|父亲)/.test(s)) return '父亲';
if (/(奶奶|祖母)/.test(s)) return '奶奶';
if (/(爷爷|祖父)/.test(s)) return '爷爷';
if (/(姥姥|外婆)/.test(s)) return '姥姥/外婆';
return s;
}
function ageToTag(age) {
if (age == null) return '';
if (age < 25) return '25岁以下';
if (age < 35) return '25-34岁';
if (age < 45) return '35-44岁';
if (age < 55) return '45-54岁';
return '55岁及以上';
}
function normalizeGrade(v) {
const s = text(v);
if (!s) return '';
if (/幼/.test(s)) return '幼儿园';
if (/(小|一年级|二年级|三年级|四年级|五年级|六年级)/.test(s)) return '小学';
if (/(初一|初二|初三|初中)/.test(s)) return '初中';
if (/(高一|高二|高三|高中)/.test(s)) return '高中';
if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学';
return s;
}
function normalizeScore(v) {
const s = text(v);
if (!s) return '一般';
if (/(优秀|优异|很好|拔尖)/.test(s)) return '优秀';
if (/(良好|较好|不错)/.test(s)) return '良好';
if (/(差|不理想|偏下|落后|薄弱)/.test(s)) return '较差';
return '一般';
}
function inferCoreProblem(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const atmosphere = normalizeFamilyAtmosphere(row['家庭氛围']);
const relation = normalizeParentChild(row['亲子关系']);
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
const majorEvent = text(row['重大影响事件_扩展']);
if (score === '较差') return '学习动力与执行(推断)';
if (/(有|是|存在|经常)/.test(negate) || /(有|是|存在|经常)/.test(physical)) {
return '教养方式调整(推断)';
}
if (atmosphere === '冷漠' || relation === '紧张' || /(有|是|分歧)/.test(divergence)) {
return '亲子沟通修复(推断)';
}
if (/(离异|变故|创伤|重大)/.test(majorEvent)) {
return '情绪与安全感支持(推断)';
}
return '阶段性成长支持(推断)';
}
function inferEducationRisk(row) {
const risk = [];
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
const withParents = text(row['孩子是否在父母身边长大']);
if (/(有|是|分歧|不一致)/.test(divergence)) risk.push('教育理念分歧');
if (/(有|是|经常|总是)/.test(negate)) risk.push('否定式沟通风险');
if (/(有|是|打|骂|体罚)/.test(physical)) risk.push('惩罚式教育风险');
if (/(否|不在|老人|寄养|留守)/.test(withParents)) risk.push('陪伴不足风险');
return risk;
}
function inferFamilyStructure(row) {
const tags = [];
const basic = text(row['家庭基本情况_规范'] || row['家庭基本情况']);
const withParents = text(row['孩子是否在父母身边长大']);
const caregivers = text(row['还有谁参与孩子的养育']);
if (/单亲|离异/.test(basic)) tags.push('单亲家庭');
if (/重组/.test(basic)) tags.push('重组家庭');
if (/三代同堂|隔代|祖/.test(basic) || /爷爷|奶奶|姥姥|外婆|祖/.test(caregivers)) tags.push('隔代参与家庭');
if (/(否|不在|寄养|留守)/.test(withParents)) tags.push('分离养育家庭');
if (!tags.length) tags.push('常规家庭结构');
return tags;
}
function inferUrgency(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const relation = normalizeParentChild(row['亲子关系']);
const physical = text(row['有无打骂教育']);
if (score === '较差' || relation === '紧张' || /(有|是|打|骂)/.test(physical)) return '高紧迫度';
if (score === '一般') return '中紧迫度';
return '低紧迫度';
}
function inferInterventionDifficulty(row) {
let score = 0;
const relation = normalizeParentChild(row['亲子关系']);
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
if (relation === '紧张') score += 2;
if (/(有|是|分歧)/.test(divergence)) score += 1;
if (/(有|是|经常)/.test(negate)) score += 1;
if (/(有|是|打|骂)/.test(physical)) score += 2;
if (score >= 4) return '高干预难度';
if (score >= 2) return '中干预难度';
return '低干预难度';
}
function inferConversionPriority(row) {
const urgency = inferUrgency(row);
const diff = inferInterventionDifficulty(row);
if (urgency === '高紧迫度' && diff !== '高干预难度') return '高优先级';
if (urgency === '高紧迫度' || diff === '中干预难度') return '中优先级';
return '低优先级';
}
function inferChannelAdaption(row) {
const q = text(row['问卷评估']);
if (!q) return '标准沟通';
if (/(线上|微信|视频)/.test(q)) return '线上沟通优先';
if (/(线下|到访|面谈)/.test(q)) return '线下面谈优先';
return '标准沟通';
}
function inferProductMatch(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const relation = normalizeParentChild(row['亲子关系']);
if (score === '较差' && relation === '紧张') return '综合干预方案';
if (score === '较差') return '学习提升方案';
if (relation === '紧张') return '亲子沟通方案';
return '成长支持方案';
}
function inferServiceDuration(row) {
const urgency = inferUrgency(row);
const difficulty = inferInterventionDifficulty(row);
if (urgency === '高紧迫度' || difficulty === '高干预难度') return '12周';
if (urgency === '中紧迫度') return '8周';
return '4周';
}
function updateTagStats(db) {
const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n || TOTAL_USERS_FALLBACK;
const allTags = db.prepare('SELECT id FROM tags').all();
const stmt = db.prepare(`
UPDATE tags SET
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
WHERE id = ?
`);
allTags.forEach((t) => stmt.run(totalUsers, t.id));
}
async function main() {
console.log('\n🚀 清洗3.0 导入流程 v1.0\n');
initializeDatabase(DB_THEME);
const db = getDb(DB_THEME);
try {
db.pragma('foreign_keys = OFF');
console.log('🗑️ 清空旧数据...');
db.prepare('DELETE FROM user_tags').run();
db.prepare('DELETE FROM users').run();
db.prepare('DELETE FROM tags').run();
db.prepare('DELETE FROM tag_categories').run();
const categoryMap = {};
const insertCategoryStmt = db.prepare(`
INSERT INTO tag_categories (key, name, color, sort_order)
VALUES (?, ?, ?, ?)
`);
TAG_CATEGORIES.forEach((cat, idx) => {
const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
categoryMap[cat.key] = result.lastInsertRowid;
});
console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类(已删除付费能力)`);
const workbook = new ExcelJS.Workbook();
await workbook.xlsx.readFile(EXCEL_FILE);
const worksheet = workbook.worksheets[0];
console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`);
const headerRow = worksheet.getRow(1);
const headers = {};
headerRow.eachCell((cell, colNumber) => {
headers[text(cell.value)] = colNumber;
});
const needHeaders = [
'家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范',
'家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子',
'有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育',
'重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称'
];
const missing = needHeaders.filter((h) => !headers[h]);
if (missing.length) {
throw new Error(`缺少关键表头: ${missing.join(', ')}`);
}
const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)');
const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)');
const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
const tagCache = new Map();
function getOrCreateTag(catKey, tagName) {
const n = text(tagName);
if (!n) return null;
const cacheKey = `${catKey}:${n}`;
if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);
let tag = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?').get(categoryMap[catKey], n);
if (!tag) {
const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`;
const result = insertTagStmt.run(key, n, categoryMap[catKey]);
tag = { id: result.lastInsertRowid };
}
tagCache.set(cacheKey, tag.id);
return tag.id;
}
let rowCount = 0;
let inserted = 0;
let inferredCoreCount = 0;
worksheet.eachRow((row, rowNumber) => {
if (rowNumber === 1) return;
rowCount += 1;
const rowObj = {};
for (const [name, idx] of Object.entries(headers)) {
rowObj[name] = row.getCell(idx).value;
}
const role = normalizeRole(rowObj['家庭角色']);
if (!role) return;
const fileName = text(rowObj['文件名称']);
const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60);
const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`;
const userExtra = {
rowNumber,
inferredCore: false,
source: 'clean3.0'
};
const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra));
if (!result.changes) return;
inserted += 1;
const userId = result.lastInsertRowid;
const addTag = (catKey, tagName) => {
const tagId = getOrCreateTag(catKey, tagName);
if (tagId) insertUserTagStmt.run(userId, tagId);
};
// 基础标签
addTag('basic_info_role', role);
addTag('basic_info_education', text(rowObj['文化程度']));
// 年龄段监护人1 + 监护人2数值年龄合并但不使用监护人2其他字段
const age1 = parseNumber(rowObj['年龄_数值']);
const age2 = parseNumber(rowObj['年龄_2_数值']);
addTag('user_age_group', ageToTag(age1));
addTag('user_age_group', ageToTag(age2));
// 学段
addTag('child_grade', normalizeGrade(rowObj['年级_规范']));
// 家庭结构
inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t));
// 教育风险
inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t));
// 家庭支持度3类氛围 + 亲子关系)
addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`);
addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`);
// 紧迫度、难度、优先级
addTag('urgency', inferUrgency(rowObj));
addTag('intervention_difficulty', inferInterventionDifficulty(rowObj));
addTag('conversion_priority', inferConversionPriority(rowObj));
// 渠道/产品/周期
addTag('channel_adaption', inferChannelAdaption(rowObj));
addTag('product_match', inferProductMatch(rowObj));
addTag('service_duration', inferServiceDuration(rowObj));
// 核心问题:优先原始扩展,否则保守推断 + (推断)
const originCore = splitMulti(rowObj['参加指导最想解决_扩展']);
if (originCore.length) {
originCore.forEach((tag) => addTag('core_problem', tag));
} else {
const inferred = inferCoreProblem(rowObj);
addTag('core_problem', inferred);
inferredCoreCount += 1;
}
if (rowCount % 500 === 0) {
console.log(` ✓ 已处理 ${rowCount}`);
}
});
console.log(`\n✅ 导入用户: ${inserted}`);
console.log(`✅ 核心问题推断数: ${inferredCoreCount}`);
updateTagStats(db);
const stats = db.prepare(`
SELECT
(SELECT COUNT(*) FROM users) as total_users,
(SELECT COUNT(*) FROM tags) as total_tags,
(SELECT COUNT(*) FROM tag_categories) as total_categories,
(SELECT COUNT(*) FROM user_tags) as total_rels
`).get();
console.log('\n📊 结果统计');
console.log(` • 用户数: ${stats.total_users}`);
console.log(` • 标签数: ${stats.total_tags}`);
console.log(` • 分类数: ${stats.total_categories}`);
console.log(` • 关系数: ${stats.total_rels}`);
const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n;
console.log(` • 付费能力分类存在数: ${deletedPayment}`);
const inferredTags = db.prepare(`
SELECT COUNT(*) as n FROM tags t
JOIN tag_categories c ON c.id = t.category_id
WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)'
`).get().n;
console.log(` • 推断核心问题标签种类: ${inferredTags}`);
db.pragma('foreign_keys = ON');
db.close();
console.log('\n🎉 清洗3.0导入完成\n');
} catch (error) {
console.error('❌ 导入失败:', error.message);
console.error(error.stack);
try { db.close(); } catch (_) {}
process.exit(1);
}
}
main();