/** * 新数据导入脚本 v4.0 * 基于"清洗2.0.xlsx"的完整数据导入 * * 特点: * - 导入1956行用户数据 * - 直接使用清洗2.0中的预生成标签(第17-31列) * - 创建16个标签分类 * * 用法: node scripts/import-clean-data-v2.js */ const ExcelJS = require('exceljs'); const path = require('path'); const { getDb, initializeDatabase } = require('../db/init'); const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx'); // ════════════════════════════════════════════════════════════════════════════ // 标签分类定义 - 16个分类 // ════════════════════════════════════════════════════════════════════════════ const TAG_CATEGORIES = [ { key: 'basic_info_role', name: '家庭角色', color: '#d97706' }, { key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' }, { key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' }, { key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' }, { key: 'education_risk', name: '教育风险标签', color: '#c084fc' }, { key: 'family_support', name: '家庭支持度标签', color: '#ec4899' }, { key: 'payment_ability', name: '付费能力标签', color: '#f472b6' }, { key: 'urgency', name: '需求紧迫度标签', color: '#f97316' }, { key: 'core_problem', name: '核心问题标签', color: '#06b6d4' }, { key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' }, { key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' }, { key: 'channel_adaption', name: '渠道适配标签', color: '#059669' }, { key: 'product_match', name: '产品匹配标签', color: '#f59e0b' }, { key: 'basic_info_education', name: '文化程度', color: '#dc2626' }, { key: 'service_duration', name: '服务周期标签', color: '#7c3aed' } ]; // ════════════════════════════════════════════════════════════════════════════ // 列数据映射(清洗2.0.xlsx) // ════════════════════════════════════════════════════════════════════════════ const COLUMN_MAPPING = { // 基础数据(列1-16) family_role: 1, // 家庭角色 education: 2, // 文化程度 profession: 3, // 职业 age: 4, // 年龄 family_role_2: 5, // 家庭角色_2 child_gender: 6, // 性别 child_grade: 7, // 年级 academic_score: 8, // 学习成绩 family_situation: 9, // 家庭基本情况 parent_child_rel: 10, // 亲子关系 education_divergence: 11, // 家长有无教育分歧 negate_child: 12, // 是否经常否定孩子 physical_punishment: 13, // 有无打骂教育 child_with_parents: 14, // 孩子是否在父母身边长大 caregivers: 15, // 还有谁参与孩子的养育 child_situation: 16, // 孩子目前情况的描述 // 预生成标签(列17-31) service_days: 17, // 天数(不是标签,是数值) user_identity: 18, // 用户身份标签 user_age: 19, // 用户年龄段标签 child_grade_tag: 20, // 孩子学段标签 family_struct_tag: 21, // 家庭结构标签 education_risk: 22, // 教育风险标签 family_support: 23, // 家庭支持度标签 payment_ability: 24, // 付费能力标签 urgency: 25, // 需求紧迫度标签 core_problem: 26, // 核心问题标签 intervention_diff: 27, // 干预难度标签 conversion_priority: 28, // 转化优先级标签 channel_adaption: 29, // 渠道适配标签 product_match: 30, // 产品匹配标签 service_duration: 31 // 服务周期标签 }; // ════════════════════════════════════════════════════════════════════════════ // 主程序 // ════════════════════════════════════════════════════════════════════════════ async function main() { console.log('\n'); console.log('╔════════════════════════════════════════════════════════════════╗'); console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║'); console.log('╚════════════════════════════════════════════════════════════════╝'); console.log(''); try { // 初始化数据库 console.log('🔧 初始化数据库...'); initializeDatabase(); const db = getDb('onion'); // 清除旧数据 console.log('🗑️ 清除旧数据...'); db.prepare('DELETE FROM user_tags').run(); db.prepare('DELETE FROM users').run(); db.prepare('DELETE FROM tags').run(); db.prepare('DELETE FROM tag_categories').run(); // 创建分类 console.log('📂 创建标签分类...'); const insertCategoryStmt = db.prepare(` INSERT INTO tag_categories (key, name, color, sort_order) VALUES (?, ?, ?, ?) `); const categoryMap = {}; TAG_CATEGORIES.forEach((cat, idx) => { const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx); categoryMap[cat.key] = result.lastInsertRowid; }); console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`); // 读取Excel文件 console.log('📖 读取Excel文件...'); const workbook = new ExcelJS.Workbook(); await workbook.xlsx.readFile(EXCEL_FILE); const worksheet = workbook.worksheets[0]; console.log(` • 工作表: ${worksheet.name}`); console.log(` • 行数: ${worksheet.rowCount}`); console.log(` • 列数: ${worksheet.columnCount}\n`); // 准备SQL语句 const insertUserStmt = db.prepare(` INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?) `); const insertTagStmt = db.prepare(` INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0) `); const insertUserTagStmt = db.prepare(` INSERT INTO user_tags (user_id, tag_id) VALUES (?, ?) `); // 标签缓存 const tagCache = {}; function getOrCreateTag(catKey, tagName) { if (!tagName || String(tagName).trim() === '') return null; const normalizedName = String(tagName).trim(); const cacheKey = `${catKey}:${normalizedName}`; if (tagCache[cacheKey]) { return tagCache[cacheKey]; } // 使用name-based lookup let tag = db.prepare(` SELECT id FROM tags WHERE category_id = ? AND name = ? `).get(categoryMap[catKey], normalizedName); if (!tag) { const result = insertTagStmt.run( `${catKey}_${Math.random().toString(36).slice(2)}`, normalizedName, categoryMap[catKey] ); tag = { id: result.lastInsertRowid }; } tagCache[cacheKey] = tag.id; return tag.id; } // 导入数据 console.log('📝 导入用户数据...\n'); let insertedCount = 0; let rowCount = 0; worksheet.eachRow((row, rowNumber) => { if (rowNumber === 1) return; // 跳过标题行 rowCount++; const values = row.values; if (!values[COLUMN_MAPPING.family_role]) { if (rowCount <= 5) { console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`); } return; } // 创建用户 const uid = `user_${rowCount}`; const extraData = { row: rowNumber, days: values[COLUMN_MAPPING.service_days] || 0 }; const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData)); if (result.changes > 0) { insertedCount++; const userId = result.lastInsertRowid; // 添加标签:基础信息 const role = values[COLUMN_MAPPING.family_role]; if (role) { const tagId = getOrCreateTag('basic_info_role', role); if (tagId) insertUserTagStmt.run(userId, tagId); } const education = values[COLUMN_MAPPING.education]; if (education) { const tagId = getOrCreateTag('basic_info_education', education); if (tagId) insertUserTagStmt.run(userId, tagId); } // 添加标签:预生成标签列(列18-31) const tagColumns = [ ['user_identity', COLUMN_MAPPING.user_identity], ['user_age_group', COLUMN_MAPPING.user_age], ['child_grade', COLUMN_MAPPING.child_grade_tag], ['family_structure', COLUMN_MAPPING.family_struct_tag], ['education_risk', COLUMN_MAPPING.education_risk], ['family_support', COLUMN_MAPPING.family_support], ['payment_ability', COLUMN_MAPPING.payment_ability], ['urgency', COLUMN_MAPPING.urgency], ['core_problem', COLUMN_MAPPING.core_problem], ['intervention_difficulty', COLUMN_MAPPING.intervention_diff], ['conversion_priority', COLUMN_MAPPING.conversion_priority], ['channel_adaption', COLUMN_MAPPING.channel_adaption], ['product_match', COLUMN_MAPPING.product_match], ['service_duration', COLUMN_MAPPING.service_duration] ]; tagColumns.forEach(([catKey, colIdx]) => { const tagValue = values[colIdx]; if (tagValue && String(tagValue).trim() !== '') { const tagId = getOrCreateTag(catKey, tagValue); if (tagId) insertUserTagStmt.run(userId, tagId); } }); if (rowCount % 100 === 0) { console.log(` ✓ 已处理 ${rowCount} 行...`); } } }); console.log(`\n✅ 用户导入完成:${insertedCount} 条\n`); // 更新标签统计 console.log('🔄 更新标签统计...'); updateTagStats(db); // 显示统计 console.log('\n📊 数据统计:'); const stats = db.prepare(` SELECT (SELECT COUNT(*) FROM users) as total_users, (SELECT COUNT(*) FROM tags) as total_tags, (SELECT COUNT(*) FROM tag_categories) as total_categories, (SELECT COUNT(*) FROM user_tags) as total_relationships `).get(); console.log(` • 总用户: ${stats.total_users}`); console.log(` • 总标签: ${stats.total_tags}`); console.log(` • 分类数: ${stats.total_categories}`); console.log(` • 用户-标签关系: ${stats.total_relationships}`); // 显示分类统计 console.log('\n分类覆盖统计:'); const catStats = db.prepare(` SELECT tc.name, COUNT(t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count FROM tag_categories tc LEFT JOIN tags t ON tc.id = t.category_id LEFT JOIN user_tags ut ON t.id = ut.tag_id GROUP BY tc.id ORDER BY tc.id `).all(); catStats.forEach(stat => { const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0; console.log(` • ${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`); }); db.close(); console.log('\n🎉 导入流程完成!\n'); } catch (error) { console.error('❌ 导入失败:', error.message); console.error(error.stack); process.exit(1); } } function updateTagStats(db) { const updateStmt = db.prepare(` UPDATE tags SET coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id), coverage_rate = ROUND( (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / (SELECT COUNT(*) FROM users), 2 ) WHERE id = ? `); const allTags = db.prepare('SELECT id FROM tags').all(); allTags.forEach(tag => { updateStmt.run(tag.id); }); } // 执行主程序 main();