/** * 新数据导入脚本 v3.0 * 基于"清洗1.0.xlsx"的完整标签体系 * * 标签体系:49个标签,分为5个维度 * 用法: node scripts/import-clean-data.js */ const ExcelJS = require('exceljs'); const path = require('path'); const { getDb, initializeDatabase } = require('../db/init'); const EXCEL_FILE = path.join(__dirname, '../清洗1.0.xlsx'); // ════════════════════════════════════════════════════════════════════════════ // 标签分类定义 v3.0 - 49个标签 5个维度 // ════════════════════════════════════════════════════════════════════════════ const TAG_CATEGORIES = [ // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ // 第一维度:监护人信息 (19个标签) // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { key: 'guardian_role', name: '监护人身份', color: '#3b82f6', columns: [1], // A: 家庭角色 type: 'discrete' }, { key: 'guardian_education', name: '文化程度', color: '#6366f1', columns: [2], // B: 文化程度 type: 'discrete' }, { key: 'guardian_occupation', name: '职业与经济地位', color: '#8b5cf6', columns: [3], // C: 职业 type: 'discrete' }, { key: 'guardian_age_group', name: '监护人年龄段', color: '#a78bfa', columns: [4], // D: 年龄 type: 'continuous' }, { key: 'second_guardian_role', name: '第二监护人身份', color: '#c084fc', columns: [5], // E: 家庭角色_2 type: 'discrete' }, // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ // 第二维度:孩子信息 (13个标签) // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { key: 'child_gender', name: '孩子性别', color: '#ec4899', columns: [6], // F: 性别 type: 'discrete' }, { key: 'child_grade', name: '孩子学段', color: '#f472b6', columns: [7], // G: 年级 type: 'discrete' }, { key: 'child_academic_score', name: '学习成绩', color: '#f97316', columns: [8], // H: 学习成绩 type: 'discrete' }, // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ // 第三维度:家庭环境 (8个标签) // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { key: 'family_structure', name: '家庭结构', color: '#06b6d4', columns: [9], // I: 家庭基本情况 type: 'keyword_extract', keywords: ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家'] }, { key: 'parent_child_relationship', name: '亲子关系', color: '#0891b2', columns: [10], // J: 亲子关系 type: 'text' }, { key: 'child_living_with_parents', name: '与父母同住情况', color: '#10b981', columns: [14], // N: 孩子是否在父母身边长大 type: 'yes_no' }, { key: 'child_caregivers', name: '参与养育人员', color: '#059669', columns: [15], // O: 还有谁参与孩子的养育 type: 'text' }, // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ // 第四维度:教育风险 (6个标签) // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { key: 'education_consensus', name: '教育理念一致性', color: '#f59e0b', columns: [11], // K: 家长有无教育分歧 type: 'yes_no' }, { key: 'child_negation', name: '否定孩子情况', color: '#d97706', columns: [12], // L: 是否经常否定孩子 type: 'yes_no' }, { key: 'physical_punishment', name: '打骂教育', color: '#dc2626', columns: [13], // M: 有无打骂教育 type: 'yes_no' }, // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ // 第五维度:服务方案 (3个标签) // ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ { key: 'service_duration', name: '服务周期', color: '#7c3aed', columns: [17], // Q: 天数 type: 'discrete' } ]; // 值映射与标准化规则 const TAG_MAPPINGS = { 'guardian_role': { '母亲': '母亲', '妈妈': '母亲', '母': '母亲', '父亲': '父亲', '爸爸': '父亲', '奶奶': '祖母', '祖母': '祖母', '爷爷': '祖父', '外婆': '外祖母', '外公': '外祖父', '姥姥': '外祖母', '姥爷': '外祖父', '舅舅': '其他亲属', '妻子': '其他亲属', '大姐': '其他亲属' }, 'guardian_education': { '初小': '小学', '小学': '小学', '初中': '初中', '中师': '中专', '中专': '中专', '高中': '高中', '大专': '大专', '大学': '本科', '本科': '本科', '大学本科': '本科', '硕士': '硕士及以上', '研究生': '硕士及以上', '在职研究生': '硕士及以上' }, 'child_gender': { '男': '男孩', '女': '女孩', '女、男': '双胞胎' }, 'child_academic_score': { '优秀': '优秀', '良好': '良好', '一般': '一般', '差': '较差' }, 'child_living_with_parents': { '是': '是', '是的': '是', '在': '是', '否': '否', '没有': '否', '不是': '否' }, 'education_consensus': { '有': '有分歧', '是': '有分歧', '否': '无分歧', '无': '无分歧', '没有': '无分歧' }, 'child_negation': { '是': '是', '有': '是', '是的': '是', '经常': '是', '否': '否', '无': '否', '没有': '否', '偶尔': '否' }, 'physical_punishment': { '有': '有', '是': '有', '有过': '有', '偶尔有': '有', '无': '无', '没有': '无', '否': '无', '基本上没有': '无' }, 'service_duration': { '60天': '60天课程', '90天': '90天课程', '180天': '180天课程' } }; // 年龄分组 function getAgeGroup(age) { if (!age || isNaN(age)) return '年龄未知'; const ageNum = parseInt(age); if (ageNum < 25) return '25岁以下'; else if (ageNum < 35) return '25-35岁'; else if (ageNum < 45) return '35-45岁'; else if (ageNum < 55) return '45-55岁'; else if (ageNum < 65) return '55-65岁'; else if (ageNum < 75) return '65-75岁'; else return '75岁以上'; } // 学段分组 function gradeToSegment(grade) { if (!grade) return '学段未知'; const gradeStr = String(grade).toLowerCase(); if (gradeStr.includes('一') || gradeStr.includes('1年')) return '小学低段(1-3年级)'; if (gradeStr.includes('二') || gradeStr.includes('2年')) return '小学低段(1-3年级)'; if (gradeStr.includes('三') || gradeStr.includes('3年')) return '小学低段(1-3年级)'; if (gradeStr.includes('四') || gradeStr.includes('4年')) return '小学高段(4-6年级)'; if (gradeStr.includes('五') || gradeStr.includes('5年')) return '小学高段(4-6年级)'; if (gradeStr.includes('六') || gradeStr.includes('6年')) return '小学高段(4-6年级)'; if (gradeStr.includes('初一')) return '初中前期(初一初二)'; if (gradeStr.includes('初二') || gradeStr.includes('准初')) return '初中前期(初一初二)'; if (gradeStr.includes('初三') || gradeStr.includes('九年')) return '初中毕业班(初三)'; if (gradeStr.includes('高一')) return '高中前期(高一高二)'; if (gradeStr.includes('高二')) return '高中前期(高一高二)'; if (gradeStr.includes('高三')) return '高中毕业班(高三)'; return '学段未知'; } // 亲子关系分类 function relationshipQuality(text) { if (!text) return '未指定'; const lowerText = String(text).toLowerCase(); if (lowerText.includes('良好') || lowerText.includes('好') || lowerText.includes('和谐') || lowerText.includes('可以') || lowerText.includes('还好') || lowerText.includes('较好') || lowerText.includes('还可以')) { return '亲子关系良好'; } if (lowerText.includes('一般') || lowerText.includes('还行') || lowerText.includes('正常') || lowerText.includes('时好时坏')) { return '亲子关系一般'; } if (lowerText.includes('不好') || lowerText.includes('差') || lowerText.includes('紧张')) { return '亲子关系较差'; } return '亲子关系未评估'; } async function importCleanData() { try { console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`); const workbook = new ExcelJS.Workbook(); await workbook.xlsx.readFile(EXCEL_FILE); const worksheet = workbook.getWorksheet(1); if (!worksheet) { throw new Error('找不到工作表'); } console.log(`📊 总行数: ${worksheet.rowCount}`); const db = getDb('onion'); // 初始化数据库 initializeDatabase('onion'); // 创建所有标签分类 console.log('🏗️ 建立分类体系...'); const categoryMap = {}; for (const cat of TAG_CATEGORIES) { const result = db.prepare(` INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color) VALUES (?, ?, ?, ?) `).run(cat.key, cat.name, 0, cat.color || '#6366f1'); const catRecord = db.prepare(` SELECT id FROM tag_categories WHERE key = ? `).get(cat.key); categoryMap[cat.key] = catRecord.id; } console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`); // 处理数据行 let insertedCount = 0; const insertUserStmt = db.prepare(` INSERT OR IGNORE INTO users (uid, name, extra_json) VALUES (?, ?, ?) `); const insertUserTagStmt = db.prepare(` INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?) `); const tagCache = {}; function getOrCreateTag(catKey, tagName) { if (!tagName || !catKey) return null; const cacheKey = `${catKey}:${tagName}`; if (tagCache[cacheKey]) return tagCache[cacheKey]; // 先尝试找系统中是否已经有这个标签 let tag = db.prepare(` SELECT id FROM tags WHERE category_id = ? AND name = ? `).get(categoryMap[catKey], tagName); if (!tag) { // 如果没有,生成一个唯一的key const tagNameNorm = String(tagName).toLowerCase().trim().replace(/\s+/g, '_'); const hashCode = Array.from(tagNameNorm).reduce((h, c) => ((h << 5) - h) + c.charCodeAt(0), 0) & 0xffffff; let tagKey = `${catKey}_${hashCode.toString(16)}`; // 检查key冲突 let counter = 1; while (db.prepare(`SELECT 1 FROM tags WHERE key = ?`).get(tagKey)) { tagKey = `${catKey}_${hashCode.toString(16)}_${counter}`; counter++; } db.prepare(` INSERT INTO tags (key, name, category_id, sort_order) VALUES (?, ?, ?, ?) `).run(tagKey, tagName, categoryMap[catKey], 0); tag = db.prepare(` SELECT id FROM tags WHERE key = ? `).get(tagKey); } tagCache[cacheKey] = tag?.id; return tag?.id; } // 遍历 Excel 数据行 let rowCount = 0; worksheet.eachRow((row, rowNumber) => { if (rowNumber === 1) return; // 跳过表头 rowCount++; const values = row.values || []; // 提取基本信息 const uid = `user_${rowNumber - 1}`; // 简单的用户ID const guardianRole = values[1]; const childGrade = values[7]; const childDesc = values[16]; if (!guardianRole) { console.warn(`⚠️ 行 ${rowNumber} 缺少监护人身份,跳过`); return; } // 构建用户额外数据 const extraData = { row: rowNumber, guardianRole: guardianRole, childGrade: childGrade, childDescription: childDesc ? String(childDesc).substring(0, 500) : '' }; // 插入用户 const result = insertUserStmt.run(uid, String(guardianRole), JSON.stringify(extraData)); if (result.changes > 0) { insertedCount++; const userId = result.lastInsertRowid; // 为用户添加标签 addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap); if (rowCount % 30 === 0) { console.log(` 📝 已处理 ${rowCount} 行...`); } } }); console.log(`\n✅ 用户导入完成:${insertedCount} 条`); // 更新所有标签的覆盖统计 console.log('🔄 更新标签统计...'); updateTagStats(db); console.log('\n📊 数据统计:'); const stats = db.prepare(` SELECT (SELECT COUNT(*) FROM users) as total_users, (SELECT COUNT(*) FROM tags) as total_tags, (SELECT COUNT(*) FROM tag_categories) as total_categories `).get(); console.log(` • 总用户: ${stats.total_users}`); console.log(` • 总标签: ${stats.total_tags}`); console.log(` • 分类数: ${stats.total_categories}`); db.close(); console.log('\n🎉 导入流程完成!\n'); } catch (error) { console.error('❌ 导入失败:', error.message); console.error(error.stack); process.exit(1); } } function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap) { // 监护人身份 if (values[1]) { const role = String(values[1]).trim(); const mapped = TAG_MAPPINGS.guardian_role[role] || role; const tagId = getOrCreateTag('guardian_role', mapped); if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId); if (rowNumber <= 5) console.log(` [行${rowNumber}] 监护人身份: "${role}" -> "${mapped}" (tagId: ${tagId})`); } // 文化程度 if (values[2]) { const edu = String(values[2]).trim(); const mapped = TAG_MAPPINGS.guardian_education[edu] || edu; const tagId = getOrCreateTag('guardian_education', mapped); if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId); if (rowNumber <= 5) console.log(` [行${rowNumber}] 文化程度: "${edu}" -> "${mapped}" (tagId: ${tagId})`); } // 职业(分类) if (values[3]) { const job = String(values[3]).trim().toLowerCase(); let jobCategory = '其他'; // 简单的职业分类 if (job.includes('教师') || job.includes('医生') || job.includes('工程') || job.includes('律师')) { jobCategory = '专业人士'; } else if (job.includes('工人') || job.includes('工厂')) { jobCategory = '工人'; } else if (job.includes('农') || job.includes('农民') || job.includes('务农')) { jobCategory = '农民'; } else if (job.includes('员工') || job.includes('职员') || job.includes('公务') || job.includes('干部')) { jobCategory = '公司/政府工作人员'; } else if (job.includes('退休') || job.includes('离退休')) { jobCategory = '退休人士'; } else if (job.includes('个体') || job.includes('自由') || job.includes('经营')) { jobCategory = '个体户/自由职业'; } else if (job.includes('商业') || job.includes('销售')) { jobCategory = '销售/商业'; } else if (job.includes('家')) { jobCategory = '家务'; } const tagId = getOrCreateTag('guardian_occupation', jobCategory); if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId); if (rowNumber <= 5) console.log(` [行${rowNumber}] 职业: "${job}" -> "${jobCategory}" (tagId: ${tagId})`); } // 年龄分组 if (values[4]) { const ageGroup = getAgeGroup(values[4]); const tagId = getOrCreateTag('guardian_age_group', ageGroup); if (tagId) insertUserTagStmt.run(userId, tagId); } // 第二监护人身份 if (values[5]) { const role2 = String(values[5]).trim(); if (role2 && role2 !== '无' && role2 !== '/') { const mapped = TAG_MAPPINGS.guardian_role[role2] || role2; const tagId = getOrCreateTag('second_guardian_role', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } } // 孩子性别 if (values[6]) { const gender = String(values[6]).trim(); const mapped = TAG_MAPPINGS.child_gender[gender] || gender; const tagId = getOrCreateTag('child_gender', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } // 孩子学段 if (values[7]) { const segment = gradeToSegment(values[7]); const tagId = getOrCreateTag('child_grade', segment); if (tagId) insertUserTagStmt.run(userId, tagId); } // 学习成绩 if (values[8]) { const scoreStr = String(values[8]).trim(); // 处理混合值 const scores = scoreStr.split(/[、,]/).map(s => s.trim()).filter(s => s && !s.includes('null')); for (const score of scores) { const mapped = TAG_MAPPINGS.child_academic_score[score] || score; const tagId = getOrCreateTag('child_academic_score', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } } // 家庭结构(关键词提取) if (values[9]) { const familyStr = String(values[9]).trim(); const keywords = ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代']; const found = new Set(); for (const kw of keywords) { if (familyStr.includes(kw) && !found.has(kw)) { found.add(kw); const tagId = getOrCreateTag('family_structure', kw); if (tagId) insertUserTagStmt.run(userId, tagId); } } // 如果没有识别任何关键词,用原始值 if (found.size === 0) { const tagId = getOrCreateTag('family_structure', familyStr.substring(0, 50)); if (tagId) insertUserTagStmt.run(userId, tagId); } } // 亲子关系 if (values[10]) { const relationship = relationshipQuality(values[10]); const tagId = getOrCreateTag('parent_child_relationship', relationship); if (tagId) insertUserTagStmt.run(userId, tagId); } // 教育理念一致性 if (values[11]) { const consensus = String(values[11]).trim(); const mapped = TAG_MAPPINGS.education_consensus[consensus] || (consensus.includes('有') ? '有分歧' : '无分歧'); const tagId = getOrCreateTag('education_consensus', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } // 是否否定孩子 if (values[12]) { const negation = String(values[12]).trim(); const mapped = TAG_MAPPINGS.child_negation[negation] || (negation.includes('是') || negation.includes('有') ? '是' : '否'); const tagId = getOrCreateTag('child_negation', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } // 打骂教育 if (values[13]) { const punishment = String(values[13]).trim(); const mapped = TAG_MAPPINGS.physical_punishment[punishment] || (punishment.includes('有') ? '有' : '无'); const tagId = getOrCreateTag('physical_punishment', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } // 孩子与父母同住 if (values[14]) { const living = String(values[14]).trim(); // 尝试映射,如果映射失败,尝试关键字匹配 let mapped = TAG_MAPPINGS.child_living_with_parents[living]; if (!mapped) { // 关键字匹配 if (living.includes('是') && !living.includes('不是')) { mapped = '是'; } else if (living.includes('否') || living.includes('不是')) { mapped = '否'; } else { mapped = '是'; // 默认 } } const tagId = getOrCreateTag('child_living_with_parents', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } // 参与养育人员 - 提取关键信息 if (values[15]) { const caregiverStr = String(values[15]).trim(); if (caregiverStr && caregiverStr !== '无' && caregiverStr !== '没有') { // 识别主要的养育者 let caregiver = '其他'; if (caregiverStr.includes('妈妈')) caregiver = '母亲'; else if (caregiverStr.includes('父亲') || caregiverStr.includes('爸爸')) caregiver = '父亲'; else if (caregiverStr.includes('爷爷')) caregiver = '祖父'; else if (caregiverStr.includes('奶奶')) caregiver = '祖母'; else if (caregiverStr.includes('外公')) caregiver = '外祖父'; else if (caregiverStr.includes('外婆')) caregiver = '外祖母'; else if (caregiverStr.includes('祖')) caregiver = '祖父母'; else if (caregiverStr.includes('外')) caregiver = '外祖父母'; const tagId = getOrCreateTag('child_caregivers', caregiver); if (tagId) insertUserTagStmt.run(userId, tagId); } } // 服务周期 if (values[17]) { const duration = String(values[17]).trim(); const mapped = TAG_MAPPINGS.service_duration[duration] || duration; const tagId = getOrCreateTag('service_duration', mapped); if (tagId) insertUserTagStmt.run(userId, tagId); } } function updateTagStats(db) { const tags = db.prepare(`SELECT id FROM tags`).all(); const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n; for (const tag of tags) { const result = db.prepare(` SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ? `).get(tag.id); const coverage = result.n || 0; const coverageRate = totalUsers > 0 ? (coverage / totalUsers * 100).toFixed(2) : 0; db.prepare(` UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ? `).run(coverage, coverageRate, tag.id); } } importCleanData();