383 lines
13 KiB
JavaScript
383 lines
13 KiB
JavaScript
/**
|
||
* 新数据导入脚本 v4.0
|
||
* 基于"清洗2.0.xlsx"的完整数据导入
|
||
*
|
||
* 特点:
|
||
* - 导入1956行用户数据
|
||
* - 直接使用清洗2.0中的预生成标签(第17-31列)
|
||
* - 创建16个标签分类
|
||
*
|
||
* 用法: node scripts/import-clean-data-v2.js
|
||
*/
|
||
|
||
const ExcelJS = require('exceljs');
|
||
const path = require('path');
|
||
const { getDb, initializeDatabase } = require('../db/init');
|
||
|
||
const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx');
|
||
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
// 标签分类定义 - 16个分类
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
|
||
const TAG_CATEGORIES = [
|
||
{
|
||
key: 'basic_info_role',
|
||
name: '家庭角色',
|
||
color: '#d97706'
|
||
},
|
||
{
|
||
key: 'user_age_group',
|
||
name: '用户年龄段标签',
|
||
color: '#6366f1'
|
||
},
|
||
{
|
||
key: 'child_grade',
|
||
name: '孩子学段标签',
|
||
color: '#8b5cf6'
|
||
},
|
||
{
|
||
key: 'family_structure',
|
||
name: '家庭结构标签',
|
||
color: '#a78bfa'
|
||
},
|
||
{
|
||
key: 'education_risk',
|
||
name: '教育风险标签',
|
||
color: '#c084fc'
|
||
},
|
||
{
|
||
key: 'family_support',
|
||
name: '家庭支持度标签',
|
||
color: '#ec4899'
|
||
},
|
||
{
|
||
key: 'payment_ability',
|
||
name: '付费能力标签',
|
||
color: '#f472b6'
|
||
},
|
||
{
|
||
key: 'urgency',
|
||
name: '需求紧迫度标签',
|
||
color: '#f97316'
|
||
},
|
||
{
|
||
key: 'core_problem',
|
||
name: '核心问题标签',
|
||
color: '#06b6d4'
|
||
},
|
||
{
|
||
key: 'intervention_difficulty',
|
||
name: '干预难度标签',
|
||
color: '#0891b2'
|
||
},
|
||
{
|
||
key: 'conversion_priority',
|
||
name: '转化优先级标签',
|
||
color: '#10b981'
|
||
},
|
||
{
|
||
key: 'channel_adaption',
|
||
name: '渠道适配标签',
|
||
color: '#059669'
|
||
},
|
||
{
|
||
key: 'product_match',
|
||
name: '产品匹配标签',
|
||
color: '#f59e0b'
|
||
},
|
||
{
|
||
key: 'basic_info_education',
|
||
name: '文化程度',
|
||
color: '#dc2626'
|
||
},
|
||
{
|
||
key: 'service_duration',
|
||
name: '服务周期标签',
|
||
color: '#7c3aed'
|
||
}
|
||
];
|
||
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
// 列数据映射(清洗2.0.xlsx)
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
|
||
const COLUMN_MAPPING = {
|
||
// 基础数据(列1-16)
|
||
family_role: 1, // 家庭角色
|
||
education: 2, // 文化程度
|
||
profession: 3, // 职业
|
||
age: 4, // 年龄
|
||
family_role_2: 5, // 家庭角色_2
|
||
child_gender: 6, // 性别
|
||
child_grade: 7, // 年级
|
||
academic_score: 8, // 学习成绩
|
||
family_situation: 9, // 家庭基本情况
|
||
parent_child_rel: 10, // 亲子关系
|
||
education_divergence: 11, // 家长有无教育分歧
|
||
negate_child: 12, // 是否经常否定孩子
|
||
physical_punishment: 13, // 有无打骂教育
|
||
child_with_parents: 14, // 孩子是否在父母身边长大
|
||
caregivers: 15, // 还有谁参与孩子的养育
|
||
child_situation: 16, // 孩子目前情况的描述
|
||
|
||
// 预生成标签(列17-31)
|
||
service_days: 17, // 天数(不是标签,是数值)
|
||
user_identity: 18, // 用户身份标签
|
||
user_age: 19, // 用户年龄段标签
|
||
child_grade_tag: 20, // 孩子学段标签
|
||
family_struct_tag: 21, // 家庭结构标签
|
||
education_risk: 22, // 教育风险标签
|
||
family_support: 23, // 家庭支持度标签
|
||
payment_ability: 24, // 付费能力标签
|
||
urgency: 25, // 需求紧迫度标签
|
||
core_problem: 26, // 核心问题标签
|
||
intervention_diff: 27, // 干预难度标签
|
||
conversion_priority: 28, // 转化优先级标签
|
||
channel_adaption: 29, // 渠道适配标签
|
||
product_match: 30, // 产品匹配标签
|
||
service_duration: 31 // 服务周期标签
|
||
};
|
||
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
// 主程序
|
||
// ════════════════════════════════════════════════════════════════════════════
|
||
|
||
async function main() {
|
||
console.log('\n');
|
||
console.log('╔════════════════════════════════════════════════════════════════╗');
|
||
console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║');
|
||
console.log('╚════════════════════════════════════════════════════════════════╝');
|
||
console.log('');
|
||
|
||
try {
|
||
// 初始化数据库
|
||
console.log('🔧 初始化数据库...');
|
||
initializeDatabase();
|
||
const db = getDb('onion');
|
||
|
||
// 清除旧数据
|
||
console.log('🗑️ 清除旧数据...');
|
||
db.prepare('DELETE FROM user_tags').run();
|
||
db.prepare('DELETE FROM users').run();
|
||
db.prepare('DELETE FROM tags').run();
|
||
db.prepare('DELETE FROM tag_categories').run();
|
||
|
||
// 创建分类
|
||
console.log('📂 创建标签分类...');
|
||
const insertCategoryStmt = db.prepare(`
|
||
INSERT INTO tag_categories (key, name, color, sort_order)
|
||
VALUES (?, ?, ?, ?)
|
||
`);
|
||
|
||
const categoryMap = {};
|
||
TAG_CATEGORIES.forEach((cat, idx) => {
|
||
const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
|
||
categoryMap[cat.key] = result.lastInsertRowid;
|
||
});
|
||
|
||
console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`);
|
||
|
||
// 读取Excel文件
|
||
console.log('📖 读取Excel文件...');
|
||
const workbook = new ExcelJS.Workbook();
|
||
await workbook.xlsx.readFile(EXCEL_FILE);
|
||
const worksheet = workbook.worksheets[0];
|
||
|
||
console.log(` • 工作表: ${worksheet.name}`);
|
||
console.log(` • 行数: ${worksheet.rowCount}`);
|
||
console.log(` • 列数: ${worksheet.columnCount}\n`);
|
||
|
||
// 准备SQL语句
|
||
const insertUserStmt = db.prepare(`
|
||
INSERT INTO users (uid, name, extra_json)
|
||
VALUES (?, ?, ?)
|
||
`);
|
||
|
||
const insertTagStmt = db.prepare(`
|
||
INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
|
||
VALUES (?, ?, ?, 0, 0, 0)
|
||
`);
|
||
|
||
const insertUserTagStmt = db.prepare(`
|
||
INSERT INTO user_tags (user_id, tag_id)
|
||
VALUES (?, ?)
|
||
`);
|
||
|
||
// 标签缓存
|
||
const tagCache = {};
|
||
|
||
function getOrCreateTag(catKey, tagName) {
|
||
if (!tagName || String(tagName).trim() === '') return null;
|
||
|
||
const normalizedName = String(tagName).trim();
|
||
const cacheKey = `${catKey}:${normalizedName}`;
|
||
|
||
if (tagCache[cacheKey]) {
|
||
return tagCache[cacheKey];
|
||
}
|
||
|
||
// 使用name-based lookup
|
||
let tag = db.prepare(`
|
||
SELECT id FROM tags WHERE category_id = ? AND name = ?
|
||
`).get(categoryMap[catKey], normalizedName);
|
||
|
||
if (!tag) {
|
||
const result = insertTagStmt.run(
|
||
`${catKey}_${Math.random().toString(36).slice(2)}`,
|
||
normalizedName,
|
||
categoryMap[catKey]
|
||
);
|
||
tag = { id: result.lastInsertRowid };
|
||
}
|
||
|
||
tagCache[cacheKey] = tag.id;
|
||
return tag.id;
|
||
}
|
||
|
||
// 导入数据
|
||
console.log('📝 导入用户数据...\n');
|
||
let insertedCount = 0;
|
||
let rowCount = 0;
|
||
|
||
worksheet.eachRow((row, rowNumber) => {
|
||
if (rowNumber === 1) return; // 跳过标题行
|
||
|
||
rowCount++;
|
||
const values = row.values;
|
||
|
||
if (!values[COLUMN_MAPPING.family_role]) {
|
||
if (rowCount <= 5) {
|
||
console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`);
|
||
}
|
||
return;
|
||
}
|
||
|
||
// 创建用户
|
||
const uid = `user_${rowCount}`;
|
||
const extraData = {
|
||
row: rowNumber,
|
||
days: values[COLUMN_MAPPING.service_days] || 0
|
||
};
|
||
|
||
const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData));
|
||
|
||
if (result.changes > 0) {
|
||
insertedCount++;
|
||
const userId = result.lastInsertRowid;
|
||
|
||
// 添加标签:基础信息
|
||
const role = values[COLUMN_MAPPING.family_role];
|
||
if (role) {
|
||
const tagId = getOrCreateTag('basic_info_role', role);
|
||
if (tagId) insertUserTagStmt.run(userId, tagId);
|
||
}
|
||
|
||
const education = values[COLUMN_MAPPING.education];
|
||
if (education) {
|
||
const tagId = getOrCreateTag('basic_info_education', education);
|
||
if (tagId) insertUserTagStmt.run(userId, tagId);
|
||
}
|
||
|
||
// 添加标签:预生成标签列(列18-31)
|
||
const tagColumns = [
|
||
['user_identity', COLUMN_MAPPING.user_identity],
|
||
['user_age_group', COLUMN_MAPPING.user_age],
|
||
['child_grade', COLUMN_MAPPING.child_grade_tag],
|
||
['family_structure', COLUMN_MAPPING.family_struct_tag],
|
||
['education_risk', COLUMN_MAPPING.education_risk],
|
||
['family_support', COLUMN_MAPPING.family_support],
|
||
['payment_ability', COLUMN_MAPPING.payment_ability],
|
||
['urgency', COLUMN_MAPPING.urgency],
|
||
['core_problem', COLUMN_MAPPING.core_problem],
|
||
['intervention_difficulty', COLUMN_MAPPING.intervention_diff],
|
||
['conversion_priority', COLUMN_MAPPING.conversion_priority],
|
||
['channel_adaption', COLUMN_MAPPING.channel_adaption],
|
||
['product_match', COLUMN_MAPPING.product_match],
|
||
['service_duration', COLUMN_MAPPING.service_duration]
|
||
];
|
||
|
||
tagColumns.forEach(([catKey, colIdx]) => {
|
||
const tagValue = values[colIdx];
|
||
if (tagValue && String(tagValue).trim() !== '') {
|
||
const tagId = getOrCreateTag(catKey, tagValue);
|
||
if (tagId) insertUserTagStmt.run(userId, tagId);
|
||
}
|
||
});
|
||
|
||
if (rowCount % 100 === 0) {
|
||
console.log(` ✓ 已处理 ${rowCount} 行...`);
|
||
}
|
||
}
|
||
});
|
||
|
||
console.log(`\n✅ 用户导入完成:${insertedCount} 条\n`);
|
||
|
||
// 更新标签统计
|
||
console.log('🔄 更新标签统计...');
|
||
updateTagStats(db);
|
||
|
||
// 显示统计
|
||
console.log('\n📊 数据统计:');
|
||
const stats = db.prepare(`
|
||
SELECT
|
||
(SELECT COUNT(*) FROM users) as total_users,
|
||
(SELECT COUNT(*) FROM tags) as total_tags,
|
||
(SELECT COUNT(*) FROM tag_categories) as total_categories,
|
||
(SELECT COUNT(*) FROM user_tags) as total_relationships
|
||
`).get();
|
||
|
||
console.log(` • 总用户: ${stats.total_users}`);
|
||
console.log(` • 总标签: ${stats.total_tags}`);
|
||
console.log(` • 分类数: ${stats.total_categories}`);
|
||
console.log(` • 用户-标签关系: ${stats.total_relationships}`);
|
||
|
||
// 显示分类统计
|
||
console.log('\n分类覆盖统计:');
|
||
const catStats = db.prepare(`
|
||
SELECT tc.name, COUNT(t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
|
||
FROM tag_categories tc
|
||
LEFT JOIN tags t ON tc.id = t.category_id
|
||
LEFT JOIN user_tags ut ON t.id = ut.tag_id
|
||
GROUP BY tc.id
|
||
ORDER BY tc.id
|
||
`).all();
|
||
|
||
catStats.forEach(stat => {
|
||
const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0;
|
||
console.log(` • ${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`);
|
||
});
|
||
|
||
db.close();
|
||
|
||
console.log('\n🎉 导入流程完成!\n');
|
||
|
||
} catch (error) {
|
||
console.error('❌ 导入失败:', error.message);
|
||
console.error(error.stack);
|
||
process.exit(1);
|
||
}
|
||
}
|
||
|
||
function updateTagStats(db) {
|
||
const updateStmt = db.prepare(`
|
||
UPDATE tags
|
||
SET
|
||
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
|
||
coverage_rate = ROUND(
|
||
(SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 /
|
||
(SELECT COUNT(*) FROM users),
|
||
2
|
||
)
|
||
WHERE id = ?
|
||
`);
|
||
|
||
const allTags = db.prepare('SELECT id FROM tags').all();
|
||
allTags.forEach(tag => {
|
||
updateStmt.run(tag.id);
|
||
});
|
||
}
|
||
|
||
// 执行主程序
|
||
main();
|