Files
onion-dmp/scripts/import-clean-data-v2.js
2026-04-08 14:52:09 +08:00

383 lines
13 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
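/**
 * Data import script v4.0
 * Full data import based on "清洗2.0.xlsx".
 *
 * Features:
 * - Imports 1956 rows of user data
 * - Uses the pre-generated tags stored in columns 18-31 of 清洗2.0.xlsx
 *   (column 17 holds the number of service days, which is a value, not a tag)
 * - Creates the tag categories defined in TAG_CATEGORIES
 *
 * Usage: node scripts/import-clean-data-v2.js
 */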
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
// Path of the workbook to import: 清洗2.0.xlsx in the parent directory of this script's directory.
const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// 标签分类定义 - 16个分类
// ════════════════════════════════════════════════════════════════════════════
// Tag categories created by the import. Each entry's array index is stored as
// the category's sort_order, so the order of this list matters.
//   key   - identifier the import code uses to look the category up
//   name  - label written to tag_categories.name
//   color - hex colour written to tag_categories.color
const TAG_CATEGORIES = [
  { key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
  { key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
  { key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
  { key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
  { key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
  { key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
  { key: 'payment_ability', name: '付费能力标签', color: '#f472b6' },
  { key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
  { key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
  { key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
  { key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
  { key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
  { key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
  { key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
  { key: 'service_duration', name: '服务周期标签', color: '#7c3aed' },
  // Category for the "用户身份标签" column (column 18). The import assigns tags
  // under the key 'user_identity', so the category has to exist here; without
  // it the category id looked up for those tags is undefined. It is appended
  // last so the sort_order of the categories above does not change.
  { key: 'user_identity', name: '用户身份标签', color: '#3b82f6' },
];
// ════════════════════════════════════════════════════════════════════════════
// Column mapping for 清洗2.0.xlsx (1-based column numbers)
// ════════════════════════════════════════════════════════════════════════════
// Maps a field name to its worksheet column number. The numbers are derived
// from the order of the names below: the first name is column 1, the second
// column 2, and so on, so the list must follow the worksheet's column order.
const COLUMN_MAPPING = Object.fromEntries(
  [
    // Raw data, columns 1-16
    'family_role', // family role
    'education', // education level
    'profession', // occupation
    'age', // age
    'family_role_2', // family role (second column)
    'child_gender', // gender
    'child_grade', // grade
    'academic_score', // academic performance
    'family_situation', // basic family situation
    'parent_child_rel', // parent-child relationship
    'education_divergence', // whether the parents disagree on education
    'negate_child', // whether the child is frequently negated
    'physical_punishment', // whether scolding/beating is used
    'child_with_parents', // whether the child grew up with the parents
    'caregivers', // who else takes part in raising the child
    'child_situation', // description of the child's current situation
    // Pre-generated columns 17-31
    'service_days', // number of days (a numeric value, not a tag)
    'user_identity', // user identity tag
    'user_age', // user age group tag
    'child_grade_tag', // child school stage tag
    'family_struct_tag', // family structure tag
    'education_risk', // education risk tag
    'family_support', // family support tag
    'payment_ability', // payment ability tag
    'urgency', // urgency of need tag
    'core_problem', // core problem tag
    'intervention_diff', // intervention difficulty tag
    'conversion_priority', // conversion priority tag
    'channel_adaption', // channel fit tag
    'product_match', // product match tag
    'service_duration', // service period tag
  ].map((fieldName, position) => [fieldName, position + 1]),
);
// ════════════════════════════════════════════════════════════════════════════
// 主程序
// ════════════════════════════════════════════════════════════════════════════
/**
 * Entry point: rebuilds the users, tags, tag categories and user-tag links of
 * the 'onion' database from the first worksheet of EXCEL_FILE.
 *
 * Steps:
 *   1. Initialise the database and load the workbook.
 *   2. Inside one transaction: delete the old rows, create the categories from
 *      TAG_CATEGORIES, insert one user per data row together with its tags,
 *      and refresh the per-tag coverage figures.
 *   3. Print summary statistics and close the database handle.
 *
 * The workbook is read before anything is deleted and all writes share one
 * transaction, so an unreadable file or a failure halfway through the import
 * is rolled back instead of leaving the tables empty or half-filled.
 *
 * NOTE(review): assumes the handle returned by getDb() accepts BEGIN / COMMIT /
 * ROLLBACK through prepare().run() (as better-sqlite3 does) — confirm against
 * ../db/init.
 *
 * @returns {Promise<void>} Resolves after a successful import. On any error the
 *   message and stack are logged and the process exits with code 1.
 */
async function main() {
  console.log('\n');
  console.log('╔════════════════════════════════════════════════════════════════╗');
  console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║');
  console.log('╚════════════════════════════════════════════════════════════════╝');
  console.log('');

  let db = null;
  let transactionOpen = false;

  try {
    // Initialise the database
    console.log('🔧 初始化数据库...');
    initializeDatabase();
    db = getDb('onion');

    // Read the workbook BEFORE touching existing data, so a missing or
    // corrupt file cannot leave the database wiped.
    console.log('📖 读取Excel文件...');
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.worksheets[0];
    if (!worksheet) {
      throw new Error(`Excel文件中没有工作表: ${EXCEL_FILE}`);
    }
    console.log(` • 工作表: ${worksheet.name}`);
    console.log(` • 行数: ${worksheet.rowCount}`);
    console.log(` • 列数: ${worksheet.columnCount}\n`);

    // Everything from here to COMMIT is atomic; the catch block rolls back.
    db.prepare('BEGIN').run();
    transactionOpen = true;

    // Remove old data (link table first, then the tables it refers to)
    console.log('🗑️ 清除旧数据...');
    db.prepare('DELETE FROM user_tags').run();
    db.prepare('DELETE FROM users').run();
    db.prepare('DELETE FROM tags').run();
    db.prepare('DELETE FROM tag_categories').run();

    // Create the categories and remember the row id assigned to each key
    console.log('📂 创建标签分类...');
    const insertCategoryStmt = db.prepare(`
INSERT INTO tag_categories (key, name, color, sort_order)
VALUES (?, ?, ?, ?)
`);
    const categoryMap = {};
    TAG_CATEGORIES.forEach((cat, idx) => {
      const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
      categoryMap[cat.key] = result.lastInsertRowid;
    });
    console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`);

    // Prepared statements, created once and reused for every row
    const insertUserStmt = db.prepare(`
INSERT INTO users (uid, name, extra_json)
VALUES (?, ?, ?)
`);
    const insertTagStmt = db.prepare(`
INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
VALUES (?, ?, ?, 0, 0, 0)
`);
    const insertUserTagStmt = db.prepare(`
INSERT INTO user_tags (user_id, tag_id)
VALUES (?, ?)
`);
    const findTagStmt = db.prepare(`
SELECT id FROM tags WHERE category_id = ? AND name = ?
`);

    // "<category key>:<tag name>" -> tag id, for tags already seen in this run
    const tagIdCache = new Map();

    // Returns the id of the tag with the given (trimmed) name in the given
    // category, inserting the tag on first use. Blank values yield null.
    const getOrCreateTag = (catKey, tagName) => {
      if (!tagName || String(tagName).trim() === '') return null;
      const normalizedName = String(tagName).trim();
      const cacheKey = `${catKey}:${normalizedName}`;
      const cachedId = tagIdCache.get(cacheKey);
      if (cachedId !== undefined) return cachedId;

      const categoryId = categoryMap[catKey];
      // Look the tag up by name within its category before inserting
      let tag = findTagStmt.get(categoryId, normalizedName);
      if (!tag) {
        const result = insertTagStmt.run(
          // The tag key is the category key plus a random base-36 suffix
          `${catKey}_${Math.random().toString(36).slice(2)}`,
          normalizedName,
          categoryId,
        );
        tag = { id: result.lastInsertRowid };
      }
      tagIdCache.set(cacheKey, tag.id);
      return tag.id;
    };

    // [category key, worksheet column] pairs applied to every row, in this
    // order: the two basic-info columns first, then the pre-generated tag
    // columns 18-31. categoryMap only holds the keys of TAG_CATEGORIES, so a
    // category key that is not defined there resolves to an undefined id.
    const tagSources = [
      ['basic_info_role', COLUMN_MAPPING.family_role],
      ['basic_info_education', COLUMN_MAPPING.education],
      ['user_identity', COLUMN_MAPPING.user_identity],
      ['user_age_group', COLUMN_MAPPING.user_age],
      ['child_grade', COLUMN_MAPPING.child_grade_tag],
      ['family_structure', COLUMN_MAPPING.family_struct_tag],
      ['education_risk', COLUMN_MAPPING.education_risk],
      ['family_support', COLUMN_MAPPING.family_support],
      ['payment_ability', COLUMN_MAPPING.payment_ability],
      ['urgency', COLUMN_MAPPING.urgency],
      ['core_problem', COLUMN_MAPPING.core_problem],
      ['intervention_difficulty', COLUMN_MAPPING.intervention_diff],
      ['conversion_priority', COLUMN_MAPPING.conversion_priority],
      ['channel_adaption', COLUMN_MAPPING.channel_adaption],
      ['product_match', COLUMN_MAPPING.product_match],
      ['service_duration', COLUMN_MAPPING.service_duration],
    ];

    // Import the rows
    console.log('📝 导入用户数据...\n');
    let insertedCount = 0;
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // skip the header row
      rowCount++;
      // row.values is indexed by column number, matching COLUMN_MAPPING
      const values = row.values;
      if (!values[COLUMN_MAPPING.family_role]) {
        // Only the first few skipped rows are reported, to keep the log short
        if (rowCount <= 5) {
          console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`);
        }
        return;
      }

      // Create the user; uid and name are both "user_<n>", where n counts
      // every data row visited so far, including skipped ones
      const uid = `user_${rowCount}`;
      const extraData = {
        row: rowNumber,
        days: values[COLUMN_MAPPING.service_days] || 0,
      };
      const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData));
      if (result.changes <= 0) return;

      insertedCount++;
      const userId = result.lastInsertRowid;

      // Link the user to one tag per non-blank tag column
      for (const [catKey, colIdx] of tagSources) {
        const tagId = getOrCreateTag(catKey, values[colIdx]);
        if (tagId) insertUserTagStmt.run(userId, tagId);
      }

      if (rowCount % 100 === 0) {
        console.log(` ✓ 已处理 ${rowCount} 行...`);
      }
    });
    console.log(`\n✅ 用户导入完成:${insertedCount}\n`);

    // Refresh the per-tag coverage figures
    console.log('🔄 更新标签统计...');
    updateTagStats(db);

    db.prepare('COMMIT').run();
    transactionOpen = false;

    // Overall statistics
    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
SELECT
(SELECT COUNT(*) FROM users) as total_users,
(SELECT COUNT(*) FROM tags) as total_tags,
(SELECT COUNT(*) FROM tag_categories) as total_categories,
(SELECT COUNT(*) FROM user_tags) as total_relationships
`).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    console.log(` • 用户-标签关系: ${stats.total_relationships}`);

    // Per-category statistics. The join with user_tags yields one row per
    // user-tag link, so tags must be counted with DISTINCT; a plain
    // COUNT(t.id) would report the number of links instead.
    console.log('\n分类覆盖统计');
    const catStats = db.prepare(`
SELECT tc.name, COUNT(DISTINCT t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
FROM tag_categories tc
LEFT JOIN tags t ON tc.id = t.category_id
LEFT JOIN user_tags ut ON t.id = ut.tag_id
GROUP BY tc.id
ORDER BY tc.id
`).all();
    catStats.forEach((stat) => {
      const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0;
      console.log(`${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`);
    });

    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    // Undo the partial import before reporting the failure
    if (transactionOpen) {
      try {
        db.prepare('ROLLBACK').run();
      } catch (rollbackError) {
        console.error('⚠️ 回滚失败:', rollbackError.message);
      }
    }
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    process.exit(1);
  }
}
/**
 * Recomputes `coverage` and `coverage_rate` for every row of the `tags` table.
 *
 * coverage      - number of distinct users linked to the tag in user_tags
 * coverage_rate - that number as a percentage of the row count of users,
 *                 rounded to two decimals
 *
 * @param {object} db - Database handle exposing prepare(); the statements it
 *   returns must provide run() and all().
 * @returns {void}
 */
function updateTagStats(db) {
  // One statement, prepared once and executed once per tag id
  const refreshOneTag = db.prepare(`
UPDATE tags
SET
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
coverage_rate = ROUND(
(SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 /
(SELECT COUNT(*) FROM users),
2
)
WHERE id = ?
`);
  const tagRows = db.prepare('SELECT id FROM tags').all();
  for (const { id } of tagRows) {
    refreshOneTag.run(id);
  }
}
// Run the main program
main();