/** * 清洗3.0 导入脚本 v1.0 * * 业务约束: * 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)” * 2) 监护人2相关字段不参与建模 * 3) 删除付费能力标签分类 * 4) 全量替换导入 */ const ExcelJS = require('exceljs'); const path = require('path'); const { getDb, initializeDatabase } = require('../db/init'); const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx'); const DB_THEME = 'onion'; const TOTAL_USERS_FALLBACK = 11500; const TAG_CATEGORIES = [ { key: 'basic_info_role', name: '家庭角色', color: '#d97706' }, { key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' }, { key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' }, { key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' }, { key: 'education_risk', name: '教育风险标签', color: '#c084fc' }, { key: 'family_support', name: '家庭支持度标签', color: '#ec4899' }, { key: 'urgency', name: '需求紧迫度标签', color: '#f97316' }, { key: 'core_problem', name: '核心问题标签', color: '#06b6d4' }, { key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' }, { key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' }, { key: 'channel_adaption', name: '渠道适配标签', color: '#059669' }, { key: 'product_match', name: '产品匹配标签', color: '#f59e0b' }, { key: 'basic_info_education', name: '文化程度', color: '#dc2626' }, { key: 'service_duration', name: '服务周期标签', color: '#7c3aed' } ]; function text(v) { if (v === undefined || v === null) return ''; return String(v).replace(/\s+/g, ' ').trim(); } function parseNumber(v) { if (v === undefined || v === null || v === '') return null; const raw = String(v).replace(/[^\d.\-]/g, ''); if (!raw) return null; const n = Number(raw); return Number.isFinite(n) ? n : null; } function splitMulti(v) { const s = text(v); if (!s) return []; return s .split(/[、,,;;/|]+/) .map((item) => item.trim()) .filter(Boolean); } function normalizeFamilyAtmosphere(v) { const s = text(v); if (!s) return '中立'; const warm = ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好']; const cold = ['冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂']; const conflict = ['争吵', '冲突', '矛盾', '紧张', '对立', '不和']; const neutral = ['一般', '普通', '还行', '尚可', '平常']; const hit = (dict) => dict.some((k) => s.includes(k)); if (hit(cold) || hit(conflict)) return '冷漠'; if (hit(warm)) return '温暖'; if (hit(neutral)) return '中立'; return '中立'; } function normalizeParentChild(v) { const s = text(v); if (!s) return '中立'; if (/(紧张|疏离|冲突|差|糟)/.test(s)) return '紧张'; if (/(良好|亲密|和谐|较好|很好)/.test(s)) return '良好'; return '中立'; } function normalizeRole(v) { const s = text(v); if (!s) return ''; if (/(妈妈|母亲|妈咪)/.test(s)) return '妈妈'; if (/(爸爸|父亲)/.test(s)) return '父亲'; if (/(奶奶|祖母)/.test(s)) return '奶奶'; if (/(爷爷|祖父)/.test(s)) return '爷爷'; if (/(姥姥|外婆)/.test(s)) return '姥姥/外婆'; return s; } function ageToTag(age) { if (age == null) return ''; if (age < 25) return '25岁以下'; if (age < 35) return '25-34岁'; if (age < 45) return '35-44岁'; if (age < 55) return '45-54岁'; return '55岁及以上'; } function normalizeGrade(v) { const s = text(v); if (!s) return ''; if (/幼/.test(s)) return '幼儿园'; if (/(小|一年级|二年级|三年级|四年级|五年级|六年级)/.test(s)) return '小学'; if (/(初一|初二|初三|初中)/.test(s)) return '初中'; if (/(高一|高二|高三|高中)/.test(s)) return '高中'; if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学'; return s; } function normalizeScore(v) { const s = text(v); if (!s) return '一般'; if (/(优秀|优异|很好|拔尖)/.test(s)) return '优秀'; if (/(良好|较好|不错)/.test(s)) return '良好'; if (/(差|不理想|偏下|落后|薄弱)/.test(s)) return '较差'; return '一般'; } function inferCoreProblem(row) { const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']); const atmosphere = normalizeFamilyAtmosphere(row['家庭氛围']); const relation = normalizeParentChild(row['亲子关系']); const divergence = text(row['家长有无教育分歧']); const negate = text(row['是否经常否定孩子']); const physical = text(row['有无打骂教育']); const majorEvent = text(row['重大影响事件_扩展']); if (score === '较差') return '学习动力与执行(推断)'; if (/(有|是|存在|经常)/.test(negate) || /(有|是|存在|经常)/.test(physical)) { return '教养方式调整(推断)'; } if (atmosphere === '冷漠' || relation === '紧张' || /(有|是|分歧)/.test(divergence)) { return '亲子沟通修复(推断)'; } if (/(离异|变故|创伤|重大)/.test(majorEvent)) { return '情绪与安全感支持(推断)'; } return '阶段性成长支持(推断)'; } function inferEducationRisk(row) { const risk = []; const divergence = text(row['家长有无教育分歧']); const negate = text(row['是否经常否定孩子']); const physical = text(row['有无打骂教育']); const withParents = text(row['孩子是否在父母身边长大']); if (/(有|是|分歧|不一致)/.test(divergence)) risk.push('教育理念分歧'); if (/(有|是|经常|总是)/.test(negate)) risk.push('否定式沟通风险'); if (/(有|是|打|骂|体罚)/.test(physical)) risk.push('惩罚式教育风险'); if (/(否|不在|老人|寄养|留守)/.test(withParents)) risk.push('陪伴不足风险'); return risk; } function inferFamilyStructure(row) { const tags = []; const basic = text(row['家庭基本情况_规范'] || row['家庭基本情况']); const withParents = text(row['孩子是否在父母身边长大']); const caregivers = text(row['还有谁参与孩子的养育']); if (/单亲|离异/.test(basic)) tags.push('单亲家庭'); if (/重组/.test(basic)) tags.push('重组家庭'); if (/三代同堂|隔代|祖/.test(basic) || /爷爷|奶奶|姥姥|外婆|祖/.test(caregivers)) tags.push('隔代参与家庭'); if (/(否|不在|寄养|留守)/.test(withParents)) tags.push('分离养育家庭'); if (!tags.length) tags.push('常规家庭结构'); return tags; } function inferUrgency(row) { const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']); const relation = normalizeParentChild(row['亲子关系']); const physical = text(row['有无打骂教育']); if (score === '较差' || relation === '紧张' || /(有|是|打|骂)/.test(physical)) return '高紧迫度'; if (score === '一般') return '中紧迫度'; return '低紧迫度'; } function inferInterventionDifficulty(row) { let score = 0; const relation = normalizeParentChild(row['亲子关系']); const divergence = text(row['家长有无教育分歧']); const negate = text(row['是否经常否定孩子']); const physical = text(row['有无打骂教育']); if (relation === '紧张') score += 2; if (/(有|是|分歧)/.test(divergence)) score += 1; if (/(有|是|经常)/.test(negate)) score += 1; if (/(有|是|打|骂)/.test(physical)) score += 2; if (score >= 4) return '高干预难度'; if (score >= 2) return '中干预难度'; return '低干预难度'; } function inferConversionPriority(row) { const urgency = inferUrgency(row); const diff = inferInterventionDifficulty(row); if (urgency === '高紧迫度' && diff !== '高干预难度') return '高优先级'; if (urgency === '高紧迫度' || diff === '中干预难度') return '中优先级'; return '低优先级'; } function inferChannelAdaption(row) { const q = text(row['问卷评估']); if (!q) return '标准沟通'; if (/(线上|微信|视频)/.test(q)) return '线上沟通优先'; if (/(线下|到访|面谈)/.test(q)) return '线下面谈优先'; return '标准沟通'; } function inferProductMatch(row) { const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']); const relation = normalizeParentChild(row['亲子关系']); if (score === '较差' && relation === '紧张') return '综合干预方案'; if (score === '较差') return '学习提升方案'; if (relation === '紧张') return '亲子沟通方案'; return '成长支持方案'; } function inferServiceDuration(row) { const urgency = inferUrgency(row); const difficulty = inferInterventionDifficulty(row); if (urgency === '高紧迫度' || difficulty === '高干预难度') return '12周'; if (urgency === '中紧迫度') return '8周'; return '4周'; } function updateTagStats(db) { const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n || TOTAL_USERS_FALLBACK; const allTags = db.prepare('SELECT id FROM tags').all(); const stmt = db.prepare(` UPDATE tags SET coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id), coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2) WHERE id = ? `); allTags.forEach((t) => stmt.run(totalUsers, t.id)); } async function main() { console.log('\n🚀 清洗3.0 导入流程 v1.0\n'); initializeDatabase(DB_THEME); const db = getDb(DB_THEME); try { db.pragma('foreign_keys = OFF'); console.log('🗑️ 清空旧数据...'); db.prepare('DELETE FROM user_tags').run(); db.prepare('DELETE FROM users').run(); db.prepare('DELETE FROM tags').run(); db.prepare('DELETE FROM tag_categories').run(); const categoryMap = {}; const insertCategoryStmt = db.prepare(` INSERT INTO tag_categories (key, name, color, sort_order) VALUES (?, ?, ?, ?) `); TAG_CATEGORIES.forEach((cat, idx) => { const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx); categoryMap[cat.key] = result.lastInsertRowid; }); console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类(已删除付费能力)`); const workbook = new ExcelJS.Workbook(); await workbook.xlsx.readFile(EXCEL_FILE); const worksheet = workbook.worksheets[0]; console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`); const headerRow = worksheet.getRow(1); const headers = {}; headerRow.eachCell((cell, colNumber) => { headers[text(cell.value)] = colNumber; }); const needHeaders = [ '家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范', '家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子', '有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育', '重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称' ]; const missing = needHeaders.filter((h) => !headers[h]); if (missing.length) { throw new Error(`缺少关键表头: ${missing.join(', ')}`); } const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)'); const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)'); const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)'); const tagCache = new Map(); function getOrCreateTag(catKey, tagName) { const n = text(tagName); if (!n) return null; const cacheKey = `${catKey}:${n}`; if (tagCache.has(cacheKey)) return tagCache.get(cacheKey); let tag = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?').get(categoryMap[catKey], n); if (!tag) { const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`; const result = insertTagStmt.run(key, n, categoryMap[catKey]); tag = { id: result.lastInsertRowid }; } tagCache.set(cacheKey, tag.id); return tag.id; } let rowCount = 0; let inserted = 0; let inferredCoreCount = 0; worksheet.eachRow((row, rowNumber) => { if (rowNumber === 1) return; rowCount += 1; const rowObj = {}; for (const [name, idx] of Object.entries(headers)) { rowObj[name] = row.getCell(idx).value; } const role = normalizeRole(rowObj['家庭角色']); if (!role) return; const fileName = text(rowObj['文件名称']); const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60); const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`; const userExtra = { rowNumber, inferredCore: false, source: 'clean3.0' }; const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra)); if (!result.changes) return; inserted += 1; const userId = result.lastInsertRowid; const addTag = (catKey, tagName) => { const tagId = getOrCreateTag(catKey, tagName); if (tagId) insertUserTagStmt.run(userId, tagId); }; // 基础标签 addTag('basic_info_role', role); addTag('basic_info_education', text(rowObj['文化程度'])); // 年龄段(监护人1 + 监护人2数值年龄合并,但不使用监护人2其他字段) const age1 = parseNumber(rowObj['年龄_数值']); const age2 = parseNumber(rowObj['年龄_2_数值']); addTag('user_age_group', ageToTag(age1)); addTag('user_age_group', ageToTag(age2)); // 学段 addTag('child_grade', normalizeGrade(rowObj['年级_规范'])); // 家庭结构 inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t)); // 教育风险 inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t)); // 家庭支持度(3类氛围 + 亲子关系) addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`); addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`); // 紧迫度、难度、优先级 addTag('urgency', inferUrgency(rowObj)); addTag('intervention_difficulty', inferInterventionDifficulty(rowObj)); addTag('conversion_priority', inferConversionPriority(rowObj)); // 渠道/产品/周期 addTag('channel_adaption', inferChannelAdaption(rowObj)); addTag('product_match', inferProductMatch(rowObj)); addTag('service_duration', inferServiceDuration(rowObj)); // 核心问题:优先原始扩展,否则保守推断 + (推断) const originCore = splitMulti(rowObj['参加指导最想解决_扩展']); if (originCore.length) { originCore.forEach((tag) => addTag('core_problem', tag)); } else { const inferred = inferCoreProblem(rowObj); addTag('core_problem', inferred); inferredCoreCount += 1; } if (rowCount % 500 === 0) { console.log(` ✓ 已处理 ${rowCount} 行`); } }); console.log(`\n✅ 导入用户: ${inserted}`); console.log(`✅ 核心问题推断数: ${inferredCoreCount}`); updateTagStats(db); const stats = db.prepare(` SELECT (SELECT COUNT(*) FROM users) as total_users, (SELECT COUNT(*) FROM tags) as total_tags, (SELECT COUNT(*) FROM tag_categories) as total_categories, (SELECT COUNT(*) FROM user_tags) as total_rels `).get(); console.log('\n📊 结果统计'); console.log(` • 用户数: ${stats.total_users}`); console.log(` • 标签数: ${stats.total_tags}`); console.log(` • 分类数: ${stats.total_categories}`); console.log(` • 关系数: ${stats.total_rels}`); const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n; console.log(` • 付费能力分类存在数: ${deletedPayment}`); const inferredTags = db.prepare(` SELECT COUNT(*) as n FROM tags t JOIN tag_categories c ON c.id = t.category_id WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)' `).get().n; console.log(` • 推断核心问题标签种类: ${inferredTags}`); db.pragma('foreign_keys = ON'); db.close(); console.log('\n🎉 清洗3.0导入完成\n'); } catch (error) { console.error('❌ 导入失败:', error.message); console.error(error.stack); try { db.close(); } catch (_) {} process.exit(1); } } main();