Files
onion-dmp/scripts/import-clean-data.js
2026-04-08 14:52:09 +08:00

674 lines
23 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Data import script v3.0.
* Implements the complete tag taxonomy derived from "清洗1.0.xlsx".
*
* Tag taxonomy: 49 tags, organised into 5 dimensions.
* Usage: node scripts/import-clean-data.js
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
// Path of the source workbook, located one directory above this script.
const EXCEL_FILE = path.join(__dirname, '../清洗1.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// Tag category definitions v3.0, grouped by dimension.
// ════════════════════════════════════════════════════════════════════════════
// Each row is [key, name, color, sourceColumn, type, keywords?] and is expanded
// into { key, name, color, columns: [sourceColumn], type, keywords? }.
// The trailing comment on each row names the worksheet column it refers to.
const TAG_CATEGORIES = [
  // Dimension 1: guardian information (documented as 19 tags)
  ['guardian_role', '监护人身份', '#3b82f6', 1, 'discrete'], // A: family role
  ['guardian_education', '文化程度', '#6366f1', 2, 'discrete'], // B: education level
  ['guardian_occupation', '职业与经济地位', '#8b5cf6', 3, 'discrete'], // C: occupation
  ['guardian_age_group', '监护人年龄段', '#a78bfa', 4, 'continuous'], // D: age
  ['second_guardian_role', '第二监护人身份', '#c084fc', 5, 'discrete'], // E: family role (2nd)

  // Dimension 2: child information (documented as 13 tags)
  ['child_gender', '孩子性别', '#ec4899', 6, 'discrete'], // F: gender
  ['child_grade', '孩子学段', '#f472b6', 7, 'discrete'], // G: grade
  ['child_academic_score', '学习成绩', '#f97316', 8, 'discrete'], // H: academic performance

  // Dimension 3: family environment (documented as 8 tags)
  [
    'family_structure',
    '家庭结构',
    '#06b6d4',
    9, // I: basic family situation
    'keyword_extract',
    ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家'],
  ],
  ['parent_child_relationship', '亲子关系', '#0891b2', 10, 'text'], // J: parent-child relationship
  ['child_living_with_parents', '与父母同住情况', '#10b981', 14, 'yes_no'], // N: grew up with parents?
  ['child_caregivers', '参与养育人员', '#059669', 15, 'text'], // O: who else helps raise the child

  // Dimension 4: education risk (documented as 6 tags)
  ['education_consensus', '教育理念一致性', '#f59e0b', 11, 'yes_no'], // K: parental disagreement?
  ['child_negation', '否定孩子情况', '#d97706', 12, 'yes_no'], // L: frequently negates the child?
  ['physical_punishment', '打骂教育', '#dc2626', 13, 'yes_no'], // M: scolding / physical punishment?

  // Dimension 5: service plan (documented as 3 tags)
  ['service_duration', '服务周期', '#7c3aed', 17, 'discrete'], // Q: number of days
].map(([key, name, color, sourceColumn, type, keywords]) => {
  const category = { key, name, color, columns: [sourceColumn], type };
  // Only keyword-extraction categories carry a keyword list.
  if (keywords) category.keywords = keywords;
  return category;
});
// Value normalisation rules.
// Authored as category -> { canonical tag name: [raw spellings] } and inverted
// below into the lookup shape the importer reads:
// TAG_MAPPINGS[category][rawSpelling] === canonicalTagName.
const TAG_MAPPINGS = Object.fromEntries(
  Object.entries({
    guardian_role: {
      '母亲': ['母亲', '妈妈', '母'],
      '父亲': ['父亲', '爸爸'],
      '祖母': ['奶奶', '祖母'],
      '祖父': ['爷爷'],
      '外祖母': ['外婆', '姥姥'],
      '外祖父': ['外公', '姥爷'],
      '其他亲属': ['舅舅', '妻子', '大姐'],
    },
    guardian_education: {
      '小学': ['初小', '小学'],
      '初中': ['初中'],
      '中专': ['中师', '中专'],
      '高中': ['高中'],
      '大专': ['大专'],
      '本科': ['大学', '本科', '大学本科'],
      '硕士及以上': ['硕士', '研究生', '在职研究生'],
    },
    child_gender: {
      '男孩': ['男'],
      '女孩': ['女'],
      '双胞胎': ['女、男'],
    },
    child_academic_score: {
      '优秀': ['优秀'],
      '良好': ['良好'],
      '一般': ['一般'],
      '较差': ['差'],
    },
    child_living_with_parents: {
      '是': ['是', '是的', '在'],
      '否': ['否', '没有', '不是'],
    },
    education_consensus: {
      '有分歧': ['有', '是'],
      '无分歧': ['否', '无', '没有'],
    },
    child_negation: {
      '是': ['是', '有', '是的', '经常'],
      '否': ['否', '无', '没有', '偶尔'],
    },
    physical_punishment: {
      '有': ['有', '是', '有过', '偶尔有'],
      '无': ['无', '没有', '否', '基本上没有'],
    },
    service_duration: {
      '60天课程': ['60天'],
      '90天课程': ['90天'],
      '180天课程': ['180天'],
    },
  }).map(([categoryKey, aliasGroups]) => [
    categoryKey,
    Object.fromEntries(
      Object.entries(aliasGroups).flatMap(([canonical, aliases]) =>
        aliases.map((alias) => [alias, canonical])),
    ),
  ]),
);
/**
 * Buckets a guardian's age into a ten-year age-group label.
 *
 * Accepts numbers as well as strings with a leading number ("42", " 42 ",
 * "42岁", "42.5"); only the integer part is used.
 *
 * @param {*} age - Raw cell value holding the age.
 * @returns {string} One of the age-group labels, or '年龄未知' when no positive
 *   age can be read from the value.
 */
function getAgeGroup(age) {
  // parseInt reads the leading digits, so unit suffixes such as "岁" are
  // tolerated instead of turning the whole value into "unknown".
  const ageNum = Number.parseInt(String(age ?? '').trim(), 10);
  if (Number.isNaN(ageNum) || ageNum <= 0) return '年龄未知';

  if (ageNum < 25) return '25岁以下';
  if (ageNum < 35) return '25-35岁';
  if (ageNum < 45) return '35-45岁';
  if (ageNum < 55) return '45-55岁';
  if (ageNum < 65) return '55-65岁';
  if (ageNum < 75) return '65-75岁';
  return '75岁以上';
}
/**
 * Maps a free-text school grade to a school-stage label.
 *
 * @param {*} grade - Raw cell value such as '三年级', '5年级', '初一' or '高三'.
 * @returns {string} The stage label, or '学段未知' when the grade is empty or
 *   not recognised.
 */
function gradeToSegment(grade) {
  if (!grade) return '学段未知';
  const gradeStr = String(grade).toLowerCase();
  const mentions = (...needles) => needles.some((needle) => gradeStr.includes(needle));

  // Junior/senior labels must be tested before the primary-school numerals:
  // '初一', '高三' etc. contain '一'/'二'/'三' and would otherwise be reported
  // as primary school.
  if (mentions('初一')) return '初中前期(初一初二)';
  if (mentions('初二', '准初')) return '初中前期(初一初二)';
  if (mentions('初三', '九年')) return '初中毕业班(初三)';
  if (mentions('高一', '高二')) return '高中前期(高一高二)';
  if (mentions('高三')) return '高中毕业班(高三)';

  if (mentions('一', '1年', '二', '2年', '三', '3年')) return '小学低段(1-3年级)';
  if (mentions('四', '4年', '五', '5年', '六', '6年')) return '小学高段(4-6年级)';
  return '学段未知';
}
/**
 * Classifies a free-text description of the parent-child relationship.
 *
 * @param {*} text - Raw cell value describing the relationship.
 * @returns {string} '亲子关系较差', '亲子关系一般' or '亲子关系良好' when a known
 *   phrase is found, '未指定' for an empty value, '亲子关系未评估' otherwise.
 */
function relationshipQuality(text) {
  if (!text) return '未指定';
  const description = String(text).toLowerCase();
  const mentions = (...phrases) => phrases.some((phrase) => description.includes(phrase));

  // Most specific first: '不好' and '时好时坏' both contain '好', so the
  // negative and middling phrases have to win over the generic positive ones.
  if (mentions('不好', '不太好', '差', '紧张')) {
    return '亲子关系较差';
  }
  if (mentions('一般', '还行', '正常', '时好时坏')) {
    return '亲子关系一般';
  }
  if (mentions('良好', '好', '和谐', '可以', '还好', '较好', '还可以')) {
    return '亲子关系良好';
  }
  return '亲子关系未评估';
}
/**
 * Imports the cleaned workbook into the 'onion' database.
 *
 * Steps, in order:
 *  1. Read the first worksheet of EXCEL_FILE.
 *  2. Insert every entry of TAG_CATEGORIES (existing keys are left untouched).
 *  3. For each data row (row 1 is treated as the header) insert a user with
 *     uid `user_<rowNumber - 1>` and attach its tags via addUserTags. Rows
 *     without a guardian role are skipped; rows whose uid already exists are
 *     not re-tagged.
 *  4. Refresh tag coverage statistics and print summary counts.
 *
 * Any error is logged and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function importCleanData() {
  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }
    console.log(`📊 总行数: ${worksheet.rowCount}`);

    const db = getDb('onion');
    // Initialise the database
    initializeDatabase('onion');

    // Create all tag categories. Statements are prepared once and reused for
    // every category instead of being re-compiled on each iteration.
    console.log('🏗️ 建立分类体系...');
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const selectCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    const categoryMap = {};
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      categoryMap[cat.key] = selectCategoryStmt.get(cat.key).id;
    }
    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    // Statements used while processing data rows
    let insertedCount = 0;
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const findTagStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const tagKeyExistsStmt = db.prepare(`SELECT 1 FROM tags WHERE key = ?`);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    const selectTagByKeyStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);

    // "<category key>:<tag name>" -> tag id, so each tag is resolved only once.
    const tagIdCache = new Map();

    // Returns the id of the tag `tagName` in category `catKey`, creating the
    // tag when it does not exist yet. Returns null when either argument is empty.
    const getOrCreateTag = (catKey, tagName) => {
      if (!tagName || !catKey) return null;
      const cacheKey = `${catKey}:${tagName}`;
      const cachedId = tagIdCache.get(cacheKey);
      if (cachedId) return cachedId;

      // Look for an existing tag with this name in the category first
      let tag = findTagStmt.get(categoryMap[catKey], tagName);
      if (!tag) {
        // Otherwise derive a key from a 24-bit hash of the normalised name.
        // The hash formula is kept as-is so generated keys stay stable.
        const tagNameNorm = String(tagName).toLowerCase().trim().replace(/\s+/g, '_');
        const hashCode = Array.from(tagNameNorm).reduce((h, c) => ((h << 5) - h) + c.charCodeAt(0), 0) & 0xffffff;
        let tagKey = `${catKey}_${hashCode.toString(16)}`;
        // On key collision append a counter until the key is free
        let counter = 1;
        while (tagKeyExistsStmt.get(tagKey)) {
          tagKey = `${catKey}_${hashCode.toString(16)}_${counter}`;
          counter++;
        }
        insertTagStmt.run(tagKey, tagName, categoryMap[catKey], 0);
        tag = selectTagByKeyStmt.get(tagKey);
      }
      tagIdCache.set(cacheKey, tag?.id);
      return tag?.id;
    };

    // Walk the worksheet rows
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row
      rowCount++;
      const values = row.values || [];

      // Basic fields
      const uid = `user_${rowNumber - 1}`; // simple positional user id
      const guardianRole = values[1];
      const childGrade = values[7];
      const childDesc = values[16];
      if (!guardianRole) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少监护人身份,跳过`);
        return;
      }

      // Extra data stored as JSON alongside the user
      const extraData = {
        row: rowNumber,
        guardianRole,
        childGrade,
        childDescription: childDesc ? String(childDesc).substring(0, 500) : ''
      };

      const result = insertUserStmt.run(uid, String(guardianRole), JSON.stringify(extraData));
      // changes === 0 means the uid already existed (INSERT OR IGNORE)
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;
        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap);
        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });
    console.log(`\n✅ 用户导入完成:${insertedCount}`);

    // Refresh coverage statistics for all tags
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    process.exit(1);
  }
}
// Occupation categories in priority order: [label, substrings that select it].
// NOTE(review): the final '家' rule is very broad (it also matches e.g. '作家');
// kept as in the original classification — confirm against the real data.
const OCCUPATION_RULES = [
  ['专业人士', ['教师', '医生', '工程', '律师']],
  ['工人', ['工人', '工厂']],
  ['农民', ['农']],
  ['公司/政府工作人员', ['员工', '职员', '公务', '干部']],
  ['退休人士', ['退休']],
  ['个体户/自由职业', ['个体', '自由', '经营']],
  ['销售/商业', ['商业', '销售']],
  ['家务', ['家']],
];

// Caregiver labels in priority order: [label, substrings that select it].
// '外祖' is tested before the generic '祖' so maternal grandparents are not
// reported as paternal ones.
const CAREGIVER_RULES = [
  ['母亲', ['妈妈', '母亲']],
  ['父亲', ['父亲', '爸爸']],
  ['祖父', ['爷爷']],
  ['祖母', ['奶奶']],
  ['外祖父', ['外公', '姥爷']],
  ['外祖母', ['外婆', '姥姥']],
  ['外祖父母', ['外祖']],
  ['祖父母', ['祖']],
  ['外祖父母', ['外']],
];

// Keywords extracted from the family-situation text; each hit becomes a tag.
const FAMILY_STRUCTURE_KEYWORDS = ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代'];

/** Returns the label of the first rule with a substring found in `text`, else `fallback`. */
function pickLabel(text, rules, fallback) {
  const hit = rules.find(([, needles]) => needles.some((needle) => text.includes(needle)));
  return hit ? hit[0] : fallback;
}

/**
 * Tells whether a free-text answer contains one of `markers` ('有' / '是')
 * outside of the negated forms '没有' / '不是', so that e.g. '没有分歧' is not
 * mistaken for an affirmative answer.
 */
function affirms(answer, markers) {
  // Replace with a space (not '') so the neighbours of a removed negation
  // cannot join up into a new marker.
  const withoutNegations = answer.replace(/没有|不是/g, ' ');
  return markers.some((marker) => withoutNegations.includes(marker));
}

/**
 * Derives every tag for one worksheet row and links it to the user.
 *
 * Relies on TAG_MAPPINGS, getAgeGroup, gradeToSegment and relationshipQuality
 * from this file. Empty cells are skipped.
 *
 * @param {number|bigint} userId - Id of the freshly inserted user.
 * @param {Array} values - Row values indexed by 1-based column number.
 * @param {number} rowNumber - Worksheet row number; rows <= 5 are logged as samples.
 * @param {function(string, string): *} getOrCreateTag - Resolves
 *   (categoryKey, tagName) to a tag id, or null/undefined when unavailable.
 * @param {{run: function}} insertUserTagStmt - Statement run with (userId, tagId).
 * @param {Object} categoryMap - Category key -> id map (accepted for interface
 *   compatibility; not read here).
 */
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap) {
  // Resolve a tag and link it to the user; a missing id (null/undefined) is skipped.
  const attachTag = (catKey, tagName) => {
    const tagId = getOrCreateTag(catKey, tagName);
    if (tagId != null) insertUserTagStmt.run(userId, tagId);
    return tagId;
  };
  const logSample = rowNumber <= 5;

  // Guardian role
  if (values[1]) {
    const role = String(values[1]).trim();
    const mapped = TAG_MAPPINGS.guardian_role[role] || role;
    const tagId = attachTag('guardian_role', mapped);
    if (logSample) console.log(` [行${rowNumber}] 监护人身份: "${role}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Education level
  if (values[2]) {
    const edu = String(values[2]).trim();
    const mapped = TAG_MAPPINGS.guardian_education[edu] || edu;
    const tagId = attachTag('guardian_education', mapped);
    if (logSample) console.log(` [行${rowNumber}] 文化程度: "${edu}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Occupation (bucketed into coarse categories)
  if (values[3]) {
    const job = String(values[3]).trim().toLowerCase();
    const jobCategory = pickLabel(job, OCCUPATION_RULES, '其他');
    const tagId = attachTag('guardian_occupation', jobCategory);
    if (logSample) console.log(` [行${rowNumber}] 职业: "${job}" -> "${jobCategory}" (tagId: ${tagId})`);
  }

  // Age group
  if (values[4]) {
    attachTag('guardian_age_group', getAgeGroup(values[4]));
  }

  // Second guardian role ('无' and '/' mean "none")
  if (values[5]) {
    const role2 = String(values[5]).trim();
    if (role2 && role2 !== '无' && role2 !== '/') {
      attachTag('second_guardian_role', TAG_MAPPINGS.guardian_role[role2] || role2);
    }
  }

  // Child gender
  if (values[6]) {
    const gender = String(values[6]).trim();
    attachTag('child_gender', TAG_MAPPINGS.child_gender[gender] || gender);
  }

  // Child school stage
  if (values[7]) {
    attachTag('child_grade', gradeToSegment(values[7]));
  }

  // Academic performance; a cell may list several values separated by
  // '、' (U+3001), ',' (U+FF0C) or an ASCII comma.
  if (values[8]) {
    const scores = String(values[8])
      .trim()
      .split(/[\u3001\uFF0C,]/)
      .map((s) => s.trim())
      .filter((s) => s && !s.includes('null'));
    for (const score of scores) {
      attachTag('child_academic_score', TAG_MAPPINGS.child_academic_score[score] || score);
    }
  }

  // Family structure (keyword extraction)
  if (values[9]) {
    const familyStr = String(values[9]).trim();
    const matched = FAMILY_STRUCTURE_KEYWORDS.filter((kw) => familyStr.includes(kw));
    for (const kw of matched) {
      attachTag('family_structure', kw);
    }
    // No known keyword: fall back to the (truncated) raw text
    if (matched.length === 0) {
      attachTag('family_structure', familyStr.substring(0, 50));
    }
  }

  // Parent-child relationship
  if (values[10]) {
    attachTag('parent_child_relationship', relationshipQuality(values[10]));
  }

  // Consistency of the parents' educational views
  if (values[11]) {
    const consensus = String(values[11]).trim();
    const mapped = TAG_MAPPINGS.education_consensus[consensus]
      || (affirms(consensus, ['有']) ? '有分歧' : '无分歧');
    attachTag('education_consensus', mapped);
  }

  // Whether the child is frequently negated
  if (values[12]) {
    const negation = String(values[12]).trim();
    const mapped = TAG_MAPPINGS.child_negation[negation]
      || (affirms(negation, ['是', '有']) ? '是' : '否');
    attachTag('child_negation', mapped);
  }

  // Scolding / physical punishment
  if (values[13]) {
    const punishment = String(values[13]).trim();
    const mapped = TAG_MAPPINGS.physical_punishment[punishment]
      || (affirms(punishment, ['有']) ? '有' : '无');
    attachTag('physical_punishment', mapped);
  }

  // Whether the child grew up with the parents
  if (values[14]) {
    const living = String(values[14]).trim();
    // Exact mapping first, then keyword matching
    let mapped = TAG_MAPPINGS.child_living_with_parents[living];
    if (!mapped) {
      if (living.includes('是') && !living.includes('不是')) {
        mapped = '是';
      } else if (['否', '不是', '不在', '没在'].some((no) => living.includes(no))) {
        mapped = '否';
      } else {
        mapped = '是'; // default
      }
    }
    attachTag('child_living_with_parents', mapped);
  }

  // Other caregivers: keep only the first recognised caregiver
  if (values[15]) {
    const caregiverStr = String(values[15]).trim();
    if (caregiverStr && caregiverStr !== '无' && caregiverStr !== '没有') {
      attachTag('child_caregivers', pickLabel(caregiverStr, CAREGIVER_RULES, '其他'));
    }
  }

  // Service duration
  if (values[17]) {
    const duration = String(values[17]).trim();
    attachTag('service_duration', TAG_MAPPINGS.service_duration[duration] || duration);
  }
}
/**
 * Recomputes `coverage` (number of users carrying the tag) and
 * `coverage_rate` (that number as a percentage of all users, rounded to two
 * decimals) for every row of `tags`.
 *
 * @param {object} db - Database handle exposing `prepare(sql)` whose statements
 *   provide `all()`, `get()` and `run()`.
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;

  // Prepared once and reused for every tag.
  const countStmt = db.prepare(`
    SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?
  `);
  const updateStmt = db.prepare(`
    UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?
  `);

  for (const { id } of tags) {
    const coverage = countStmt.get(id).n || 0;
    // toFixed() yields a string; convert back so the rate is always bound as
    // a number, matching the 0 written when there are no users.
    const coverageRate = totalUsers > 0
      ? Number((coverage / totalUsers * 100).toFixed(2))
      : 0;
    updateStmt.run(coverage, coverageRate, id);
  }
}
// Script entry point: the import starts as soon as this module is loaded.
// importCleanData handles its own errors (it logs them and exits with code 1).
importCleanData();