449 lines
16 KiB
JavaScript
449 lines
16 KiB
JavaScript
/**
|
||
* 清洗3.0 导入脚本 v1.0
|
||
*
|
||
* 业务约束:
|
||
* 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)”
|
||
* 2) 监护人2相关字段不参与建模
|
||
* 3) 删除付费能力标签分类
|
||
* 4) 全量替换导入
|
||
*/
|
||
|
||
const ExcelJS = require('exceljs');
|
||
const path = require('path');
|
||
const { getDb, initializeDatabase } = require('../db/init');
|
||
|
||
const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx');
|
||
const DB_THEME = 'onion';
|
||
const TOTAL_USERS_FALLBACK = 11500;
|
||
|
||
const TAG_CATEGORIES = [
|
||
{ key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
|
||
{ key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
|
||
{ key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
|
||
{ key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
|
||
{ key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
|
||
{ key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
|
||
{ key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
|
||
{ key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
|
||
{ key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
|
||
{ key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
|
||
{ key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
|
||
{ key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
|
||
{ key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
|
||
{ key: 'service_duration', name: '服务周期标签', color: '#7c3aed' }
|
||
];
|
||
|
||
function text(v) {
|
||
if (v === undefined || v === null) return '';
|
||
return String(v).replace(/\s+/g, ' ').trim();
|
||
}
|
||
|
||
function parseNumber(v) {
|
||
if (v === undefined || v === null || v === '') return null;
|
||
const raw = String(v).replace(/[^\d.\-]/g, '');
|
||
if (!raw) return null;
|
||
const n = Number(raw);
|
||
return Number.isFinite(n) ? n : null;
|
||
}
|
||
|
||
function splitMulti(v) {
|
||
const s = text(v);
|
||
if (!s) return [];
|
||
return s
|
||
.split(/[、,,;;/|]+/)
|
||
.map((item) => item.trim())
|
||
.filter(Boolean);
|
||
}
|
||
|
||
function normalizeFamilyAtmosphere(v) {
|
||
const s = text(v);
|
||
if (!s) return '中立';
|
||
|
||
const warm = ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好'];
|
||
const cold = ['冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂'];
|
||
const conflict = ['争吵', '冲突', '矛盾', '紧张', '对立', '不和'];
|
||
const neutral = ['一般', '普通', '还行', '尚可', '平常'];
|
||
|
||
const hit = (dict) => dict.some((k) => s.includes(k));
|
||
|
||
if (hit(cold) || hit(conflict)) return '冷漠';
|
||
if (hit(warm)) return '温暖';
|
||
if (hit(neutral)) return '中立';
|
||
return '中立';
|
||
}
|
||
|
||
function normalizeParentChild(v) {
|
||
const s = text(v);
|
||
if (!s) return '中立';
|
||
if (/(紧张|疏离|冲突|差|糟)/.test(s)) return '紧张';
|
||
if (/(良好|亲密|和谐|较好|很好)/.test(s)) return '良好';
|
||
return '中立';
|
||
}
|
||
|
||
function normalizeRole(v) {
|
||
const s = text(v);
|
||
if (!s) return '';
|
||
if (/(妈妈|母亲|妈咪)/.test(s)) return '妈妈';
|
||
if (/(爸爸|父亲)/.test(s)) return '父亲';
|
||
if (/(奶奶|祖母)/.test(s)) return '奶奶';
|
||
if (/(爷爷|祖父)/.test(s)) return '爷爷';
|
||
if (/(姥姥|外婆)/.test(s)) return '姥姥/外婆';
|
||
return s;
|
||
}
|
||
|
||
function ageToTag(age) {
|
||
if (age == null) return '';
|
||
if (age < 25) return '25岁以下';
|
||
if (age < 35) return '25-34岁';
|
||
if (age < 45) return '35-44岁';
|
||
if (age < 55) return '45-54岁';
|
||
return '55岁及以上';
|
||
}
|
||
|
||
function normalizeGrade(v) {
|
||
const s = text(v);
|
||
if (!s) return '';
|
||
if (/幼/.test(s)) return '幼儿园';
|
||
if (/(小|一年级|二年级|三年级|四年级|五年级|六年级)/.test(s)) return '小学';
|
||
if (/(初一|初二|初三|初中)/.test(s)) return '初中';
|
||
if (/(高一|高二|高三|高中)/.test(s)) return '高中';
|
||
if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学';
|
||
return s;
|
||
}
|
||
|
||
function normalizeScore(v) {
|
||
const s = text(v);
|
||
if (!s) return '一般';
|
||
if (/(优秀|优异|很好|拔尖)/.test(s)) return '优秀';
|
||
if (/(良好|较好|不错)/.test(s)) return '良好';
|
||
if (/(差|不理想|偏下|落后|薄弱)/.test(s)) return '较差';
|
||
return '一般';
|
||
}
|
||
|
||
function inferCoreProblem(row) {
|
||
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
|
||
const atmosphere = normalizeFamilyAtmosphere(row['家庭氛围']);
|
||
const relation = normalizeParentChild(row['亲子关系']);
|
||
const divergence = text(row['家长有无教育分歧']);
|
||
const negate = text(row['是否经常否定孩子']);
|
||
const physical = text(row['有无打骂教育']);
|
||
const majorEvent = text(row['重大影响事件_扩展']);
|
||
|
||
if (score === '较差') return '学习动力与执行(推断)';
|
||
if (/(有|是|存在|经常)/.test(negate) || /(有|是|存在|经常)/.test(physical)) {
|
||
return '教养方式调整(推断)';
|
||
}
|
||
if (atmosphere === '冷漠' || relation === '紧张' || /(有|是|分歧)/.test(divergence)) {
|
||
return '亲子沟通修复(推断)';
|
||
}
|
||
if (/(离异|变故|创伤|重大)/.test(majorEvent)) {
|
||
return '情绪与安全感支持(推断)';
|
||
}
|
||
return '阶段性成长支持(推断)';
|
||
}
|
||
|
||
function inferEducationRisk(row) {
|
||
const risk = [];
|
||
const divergence = text(row['家长有无教育分歧']);
|
||
const negate = text(row['是否经常否定孩子']);
|
||
const physical = text(row['有无打骂教育']);
|
||
const withParents = text(row['孩子是否在父母身边长大']);
|
||
|
||
if (/(有|是|分歧|不一致)/.test(divergence)) risk.push('教育理念分歧');
|
||
if (/(有|是|经常|总是)/.test(negate)) risk.push('否定式沟通风险');
|
||
if (/(有|是|打|骂|体罚)/.test(physical)) risk.push('惩罚式教育风险');
|
||
if (/(否|不在|老人|寄养|留守)/.test(withParents)) risk.push('陪伴不足风险');
|
||
|
||
return risk;
|
||
}
|
||
|
||
function inferFamilyStructure(row) {
|
||
const tags = [];
|
||
const basic = text(row['家庭基本情况_规范'] || row['家庭基本情况']);
|
||
const withParents = text(row['孩子是否在父母身边长大']);
|
||
const caregivers = text(row['还有谁参与孩子的养育']);
|
||
|
||
if (/单亲|离异/.test(basic)) tags.push('单亲家庭');
|
||
if (/重组/.test(basic)) tags.push('重组家庭');
|
||
if (/三代同堂|隔代|祖/.test(basic) || /爷爷|奶奶|姥姥|外婆|祖/.test(caregivers)) tags.push('隔代参与家庭');
|
||
if (/(否|不在|寄养|留守)/.test(withParents)) tags.push('分离养育家庭');
|
||
if (!tags.length) tags.push('常规家庭结构');
|
||
|
||
return tags;
|
||
}
|
||
|
||
function inferUrgency(row) {
|
||
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
|
||
const relation = normalizeParentChild(row['亲子关系']);
|
||
const physical = text(row['有无打骂教育']);
|
||
|
||
if (score === '较差' || relation === '紧张' || /(有|是|打|骂)/.test(physical)) return '高紧迫度';
|
||
if (score === '一般') return '中紧迫度';
|
||
return '低紧迫度';
|
||
}
|
||
|
||
function inferInterventionDifficulty(row) {
|
||
let score = 0;
|
||
const relation = normalizeParentChild(row['亲子关系']);
|
||
const divergence = text(row['家长有无教育分歧']);
|
||
const negate = text(row['是否经常否定孩子']);
|
||
const physical = text(row['有无打骂教育']);
|
||
|
||
if (relation === '紧张') score += 2;
|
||
if (/(有|是|分歧)/.test(divergence)) score += 1;
|
||
if (/(有|是|经常)/.test(negate)) score += 1;
|
||
if (/(有|是|打|骂)/.test(physical)) score += 2;
|
||
|
||
if (score >= 4) return '高干预难度';
|
||
if (score >= 2) return '中干预难度';
|
||
return '低干预难度';
|
||
}
|
||
|
||
function inferConversionPriority(row) {
|
||
const urgency = inferUrgency(row);
|
||
const diff = inferInterventionDifficulty(row);
|
||
if (urgency === '高紧迫度' && diff !== '高干预难度') return '高优先级';
|
||
if (urgency === '高紧迫度' || diff === '中干预难度') return '中优先级';
|
||
return '低优先级';
|
||
}
|
||
|
||
function inferChannelAdaption(row) {
|
||
const q = text(row['问卷评估']);
|
||
if (!q) return '标准沟通';
|
||
if (/(线上|微信|视频)/.test(q)) return '线上沟通优先';
|
||
if (/(线下|到访|面谈)/.test(q)) return '线下面谈优先';
|
||
return '标准沟通';
|
||
}
|
||
|
||
function inferProductMatch(row) {
|
||
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
|
||
const relation = normalizeParentChild(row['亲子关系']);
|
||
if (score === '较差' && relation === '紧张') return '综合干预方案';
|
||
if (score === '较差') return '学习提升方案';
|
||
if (relation === '紧张') return '亲子沟通方案';
|
||
return '成长支持方案';
|
||
}
|
||
|
||
function inferServiceDuration(row) {
|
||
const urgency = inferUrgency(row);
|
||
const difficulty = inferInterventionDifficulty(row);
|
||
if (urgency === '高紧迫度' || difficulty === '高干预难度') return '12周';
|
||
if (urgency === '中紧迫度') return '8周';
|
||
return '4周';
|
||
}
|
||
|
||
function updateTagStats(db) {
|
||
const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n || TOTAL_USERS_FALLBACK;
|
||
const allTags = db.prepare('SELECT id FROM tags').all();
|
||
const stmt = db.prepare(`
|
||
UPDATE tags SET
|
||
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
|
||
coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
|
||
WHERE id = ?
|
||
`);
|
||
|
||
allTags.forEach((t) => stmt.run(totalUsers, t.id));
|
||
}
|
||
|
||
async function main() {
|
||
console.log('\n🚀 清洗3.0 导入流程 v1.0\n');
|
||
|
||
initializeDatabase(DB_THEME);
|
||
const db = getDb(DB_THEME);
|
||
|
||
try {
|
||
db.pragma('foreign_keys = OFF');
|
||
|
||
console.log('🗑️ 清空旧数据...');
|
||
db.prepare('DELETE FROM user_tags').run();
|
||
db.prepare('DELETE FROM users').run();
|
||
db.prepare('DELETE FROM tags').run();
|
||
db.prepare('DELETE FROM tag_categories').run();
|
||
|
||
const categoryMap = {};
|
||
const insertCategoryStmt = db.prepare(`
|
||
INSERT INTO tag_categories (key, name, color, sort_order)
|
||
VALUES (?, ?, ?, ?)
|
||
`);
|
||
|
||
TAG_CATEGORIES.forEach((cat, idx) => {
|
||
const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
|
||
categoryMap[cat.key] = result.lastInsertRowid;
|
||
});
|
||
|
||
console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类(已删除付费能力)`);
|
||
|
||
const workbook = new ExcelJS.Workbook();
|
||
await workbook.xlsx.readFile(EXCEL_FILE);
|
||
const worksheet = workbook.worksheets[0];
|
||
|
||
console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`);
|
||
|
||
const headerRow = worksheet.getRow(1);
|
||
const headers = {};
|
||
headerRow.eachCell((cell, colNumber) => {
|
||
headers[text(cell.value)] = colNumber;
|
||
});
|
||
|
||
const needHeaders = [
|
||
'家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范',
|
||
'家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子',
|
||
'有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育',
|
||
'重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称'
|
||
];
|
||
|
||
const missing = needHeaders.filter((h) => !headers[h]);
|
||
if (missing.length) {
|
||
throw new Error(`缺少关键表头: ${missing.join(', ')}`);
|
||
}
|
||
|
||
const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)');
|
||
const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)');
|
||
const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
|
||
|
||
const tagCache = new Map();
|
||
|
||
function getOrCreateTag(catKey, tagName) {
|
||
const n = text(tagName);
|
||
if (!n) return null;
|
||
const cacheKey = `${catKey}:${n}`;
|
||
if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);
|
||
|
||
let tag = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?').get(categoryMap[catKey], n);
|
||
if (!tag) {
|
||
const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`;
|
||
const result = insertTagStmt.run(key, n, categoryMap[catKey]);
|
||
tag = { id: result.lastInsertRowid };
|
||
}
|
||
tagCache.set(cacheKey, tag.id);
|
||
return tag.id;
|
||
}
|
||
|
||
let rowCount = 0;
|
||
let inserted = 0;
|
||
let inferredCoreCount = 0;
|
||
|
||
worksheet.eachRow((row, rowNumber) => {
|
||
if (rowNumber === 1) return;
|
||
rowCount += 1;
|
||
|
||
const rowObj = {};
|
||
for (const [name, idx] of Object.entries(headers)) {
|
||
rowObj[name] = row.getCell(idx).value;
|
||
}
|
||
|
||
const role = normalizeRole(rowObj['家庭角色']);
|
||
if (!role) return;
|
||
|
||
const fileName = text(rowObj['文件名称']);
|
||
const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60);
|
||
const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`;
|
||
|
||
const userExtra = {
|
||
rowNumber,
|
||
inferredCore: false,
|
||
source: 'clean3.0'
|
||
};
|
||
|
||
const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra));
|
||
if (!result.changes) return;
|
||
|
||
inserted += 1;
|
||
const userId = result.lastInsertRowid;
|
||
|
||
const addTag = (catKey, tagName) => {
|
||
const tagId = getOrCreateTag(catKey, tagName);
|
||
if (tagId) insertUserTagStmt.run(userId, tagId);
|
||
};
|
||
|
||
// 基础标签
|
||
addTag('basic_info_role', role);
|
||
addTag('basic_info_education', text(rowObj['文化程度']));
|
||
|
||
// 年龄段(监护人1 + 监护人2数值年龄合并,但不使用监护人2其他字段)
|
||
const age1 = parseNumber(rowObj['年龄_数值']);
|
||
const age2 = parseNumber(rowObj['年龄_2_数值']);
|
||
addTag('user_age_group', ageToTag(age1));
|
||
addTag('user_age_group', ageToTag(age2));
|
||
|
||
// 学段
|
||
addTag('child_grade', normalizeGrade(rowObj['年级_规范']));
|
||
|
||
// 家庭结构
|
||
inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t));
|
||
|
||
// 教育风险
|
||
inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t));
|
||
|
||
// 家庭支持度(3类氛围 + 亲子关系)
|
||
addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`);
|
||
addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`);
|
||
|
||
// 紧迫度、难度、优先级
|
||
addTag('urgency', inferUrgency(rowObj));
|
||
addTag('intervention_difficulty', inferInterventionDifficulty(rowObj));
|
||
addTag('conversion_priority', inferConversionPriority(rowObj));
|
||
|
||
// 渠道/产品/周期
|
||
addTag('channel_adaption', inferChannelAdaption(rowObj));
|
||
addTag('product_match', inferProductMatch(rowObj));
|
||
addTag('service_duration', inferServiceDuration(rowObj));
|
||
|
||
// 核心问题:优先原始扩展,否则保守推断 + (推断)
|
||
const originCore = splitMulti(rowObj['参加指导最想解决_扩展']);
|
||
if (originCore.length) {
|
||
originCore.forEach((tag) => addTag('core_problem', tag));
|
||
} else {
|
||
const inferred = inferCoreProblem(rowObj);
|
||
addTag('core_problem', inferred);
|
||
inferredCoreCount += 1;
|
||
}
|
||
|
||
if (rowCount % 500 === 0) {
|
||
console.log(` ✓ 已处理 ${rowCount} 行`);
|
||
}
|
||
});
|
||
|
||
console.log(`\n✅ 导入用户: ${inserted}`);
|
||
console.log(`✅ 核心问题推断数: ${inferredCoreCount}`);
|
||
|
||
updateTagStats(db);
|
||
|
||
const stats = db.prepare(`
|
||
SELECT
|
||
(SELECT COUNT(*) FROM users) as total_users,
|
||
(SELECT COUNT(*) FROM tags) as total_tags,
|
||
(SELECT COUNT(*) FROM tag_categories) as total_categories,
|
||
(SELECT COUNT(*) FROM user_tags) as total_rels
|
||
`).get();
|
||
|
||
console.log('\n📊 结果统计');
|
||
console.log(` • 用户数: ${stats.total_users}`);
|
||
console.log(` • 标签数: ${stats.total_tags}`);
|
||
console.log(` • 分类数: ${stats.total_categories}`);
|
||
console.log(` • 关系数: ${stats.total_rels}`);
|
||
|
||
const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n;
|
||
console.log(` • 付费能力分类存在数: ${deletedPayment}`);
|
||
|
||
const inferredTags = db.prepare(`
|
||
SELECT COUNT(*) as n FROM tags t
|
||
JOIN tag_categories c ON c.id = t.category_id
|
||
WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)'
|
||
`).get().n;
|
||
console.log(` • 推断核心问题标签种类: ${inferredTags}`);
|
||
|
||
db.pragma('foreign_keys = ON');
|
||
db.close();
|
||
|
||
console.log('\n🎉 清洗3.0导入完成\n');
|
||
} catch (error) {
|
||
console.error('❌ 导入失败:', error.message);
|
||
console.error(error.stack);
|
||
try { db.close(); } catch (_) {}
|
||
process.exit(1);
|
||
}
|
||
}
|
||
|
||
main();
|