Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

View File

@@ -0,0 +1,382 @@
/**
* 新数据导入脚本 v4.0
* 基于"清洗2.0.xlsx"的完整数据导入
*
* 特点:
* - 导入1956行用户数据
* - 直接使用清洗2.0中的预生成标签第17-31列
* - 创建16个标签分类
*
* 用法: node scripts/import-clean-data-v2.js
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = path.join(__dirname, '../清洗2.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// 标签分类定义 - 16个分类
// ════════════════════════════════════════════════════════════════════════════
const TAG_CATEGORIES = [
{
key: 'basic_info_role',
name: '家庭角色',
color: '#d97706'
},
{
key: 'user_age_group',
name: '用户年龄段标签',
color: '#6366f1'
},
{
key: 'child_grade',
name: '孩子学段标签',
color: '#8b5cf6'
},
{
key: 'family_structure',
name: '家庭结构标签',
color: '#a78bfa'
},
{
key: 'education_risk',
name: '教育风险标签',
color: '#c084fc'
},
{
key: 'family_support',
name: '家庭支持度标签',
color: '#ec4899'
},
{
key: 'payment_ability',
name: '付费能力标签',
color: '#f472b6'
},
{
key: 'urgency',
name: '需求紧迫度标签',
color: '#f97316'
},
{
key: 'core_problem',
name: '核心问题标签',
color: '#06b6d4'
},
{
key: 'intervention_difficulty',
name: '干预难度标签',
color: '#0891b2'
},
{
key: 'conversion_priority',
name: '转化优先级标签',
color: '#10b981'
},
{
key: 'channel_adaption',
name: '渠道适配标签',
color: '#059669'
},
{
key: 'product_match',
name: '产品匹配标签',
color: '#f59e0b'
},
{
key: 'basic_info_education',
name: '文化程度',
color: '#dc2626'
},
{
key: 'service_duration',
name: '服务周期标签',
color: '#7c3aed'
}
];
// ════════════════════════════════════════════════════════════════════════════
// 列数据映射清洗2.0.xlsx
// ════════════════════════════════════════════════════════════════════════════
const COLUMN_MAPPING = {
// 基础数据列1-16
family_role: 1, // 家庭角色
education: 2, // 文化程度
profession: 3, // 职业
age: 4, // 年龄
family_role_2: 5, // 家庭角色_2
child_gender: 6, // 性别
child_grade: 7, // 年级
academic_score: 8, // 学习成绩
family_situation: 9, // 家庭基本情况
parent_child_rel: 10, // 亲子关系
education_divergence: 11, // 家长有无教育分歧
negate_child: 12, // 是否经常否定孩子
physical_punishment: 13, // 有无打骂教育
child_with_parents: 14, // 孩子是否在父母身边长大
caregivers: 15, // 还有谁参与孩子的养育
child_situation: 16, // 孩子目前情况的描述
// 预生成标签列17-31
service_days: 17, // 天数(不是标签,是数值)
user_identity: 18, // 用户身份标签
user_age: 19, // 用户年龄段标签
child_grade_tag: 20, // 孩子学段标签
family_struct_tag: 21, // 家庭结构标签
education_risk: 22, // 教育风险标签
family_support: 23, // 家庭支持度标签
payment_ability: 24, // 付费能力标签
urgency: 25, // 需求紧迫度标签
core_problem: 26, // 核心问题标签
intervention_diff: 27, // 干预难度标签
conversion_priority: 28, // 转化优先级标签
channel_adaption: 29, // 渠道适配标签
product_match: 30, // 产品匹配标签
service_duration: 31 // 服务周期标签
};
// ════════════════════════════════════════════════════════════════════════════
// 主程序
// ════════════════════════════════════════════════════════════════════════════
async function main() {
console.log('\n');
console.log('╔════════════════════════════════════════════════════════════════╗');
console.log('║ 📥 清洗2.0.xlsx 数据导入程序 v4.0 ║');
console.log('╚════════════════════════════════════════════════════════════════╝');
console.log('');
try {
// 初始化数据库
console.log('🔧 初始化数据库...');
initializeDatabase();
const db = getDb('onion');
// 清除旧数据
console.log('🗑️ 清除旧数据...');
db.prepare('DELETE FROM user_tags').run();
db.prepare('DELETE FROM users').run();
db.prepare('DELETE FROM tags').run();
db.prepare('DELETE FROM tag_categories').run();
// 创建分类
console.log('📂 创建标签分类...');
const insertCategoryStmt = db.prepare(`
INSERT INTO tag_categories (key, name, color, sort_order)
VALUES (?, ?, ?, ?)
`);
const categoryMap = {};
TAG_CATEGORIES.forEach((cat, idx) => {
const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
categoryMap[cat.key] = result.lastInsertRowid;
});
console.log(` ✅ 创建 ${TAG_CATEGORIES.length} 个分类\n`);
// 读取Excel文件
console.log('📖 读取Excel文件...');
const workbook = new ExcelJS.Workbook();
await workbook.xlsx.readFile(EXCEL_FILE);
const worksheet = workbook.worksheets[0];
console.log(` • 工作表: ${worksheet.name}`);
console.log(` • 行数: ${worksheet.rowCount}`);
console.log(` • 列数: ${worksheet.columnCount}\n`);
// 准备SQL语句
const insertUserStmt = db.prepare(`
INSERT INTO users (uid, name, extra_json)
VALUES (?, ?, ?)
`);
const insertTagStmt = db.prepare(`
INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order)
VALUES (?, ?, ?, 0, 0, 0)
`);
const insertUserTagStmt = db.prepare(`
INSERT INTO user_tags (user_id, tag_id)
VALUES (?, ?)
`);
// 标签缓存
const tagCache = {};
function getOrCreateTag(catKey, tagName) {
if (!tagName || String(tagName).trim() === '') return null;
const normalizedName = String(tagName).trim();
const cacheKey = `${catKey}:${normalizedName}`;
if (tagCache[cacheKey]) {
return tagCache[cacheKey];
}
// 使用name-based lookup
let tag = db.prepare(`
SELECT id FROM tags WHERE category_id = ? AND name = ?
`).get(categoryMap[catKey], normalizedName);
if (!tag) {
const result = insertTagStmt.run(
`${catKey}_${Math.random().toString(36).slice(2)}`,
normalizedName,
categoryMap[catKey]
);
tag = { id: result.lastInsertRowid };
}
tagCache[cacheKey] = tag.id;
return tag.id;
}
// 导入数据
console.log('📝 导入用户数据...\n');
let insertedCount = 0;
let rowCount = 0;
worksheet.eachRow((row, rowNumber) => {
if (rowNumber === 1) return; // 跳过标题行
rowCount++;
const values = row.values;
if (!values[COLUMN_MAPPING.family_role]) {
if (rowCount <= 5) {
console.warn(`⚠️ 行 ${rowNumber} 缺少家庭角色,跳过`);
}
return;
}
// 创建用户
const uid = `user_${rowCount}`;
const extraData = {
row: rowNumber,
days: values[COLUMN_MAPPING.service_days] || 0
};
const result = insertUserStmt.run(uid, uid, JSON.stringify(extraData));
if (result.changes > 0) {
insertedCount++;
const userId = result.lastInsertRowid;
// 添加标签:基础信息
const role = values[COLUMN_MAPPING.family_role];
if (role) {
const tagId = getOrCreateTag('basic_info_role', role);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
const education = values[COLUMN_MAPPING.education];
if (education) {
const tagId = getOrCreateTag('basic_info_education', education);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
// 添加标签预生成标签列列18-31
const tagColumns = [
['user_identity', COLUMN_MAPPING.user_identity],
['user_age_group', COLUMN_MAPPING.user_age],
['child_grade', COLUMN_MAPPING.child_grade_tag],
['family_structure', COLUMN_MAPPING.family_struct_tag],
['education_risk', COLUMN_MAPPING.education_risk],
['family_support', COLUMN_MAPPING.family_support],
['payment_ability', COLUMN_MAPPING.payment_ability],
['urgency', COLUMN_MAPPING.urgency],
['core_problem', COLUMN_MAPPING.core_problem],
['intervention_difficulty', COLUMN_MAPPING.intervention_diff],
['conversion_priority', COLUMN_MAPPING.conversion_priority],
['channel_adaption', COLUMN_MAPPING.channel_adaption],
['product_match', COLUMN_MAPPING.product_match],
['service_duration', COLUMN_MAPPING.service_duration]
];
tagColumns.forEach(([catKey, colIdx]) => {
const tagValue = values[colIdx];
if (tagValue && String(tagValue).trim() !== '') {
const tagId = getOrCreateTag(catKey, tagValue);
if (tagId) insertUserTagStmt.run(userId, tagId);
}
});
if (rowCount % 100 === 0) {
console.log(` ✓ 已处理 ${rowCount} 行...`);
}
}
});
console.log(`\n✅ 用户导入完成:${insertedCount}\n`);
// 更新标签统计
console.log('🔄 更新标签统计...');
updateTagStats(db);
// 显示统计
console.log('\n📊 数据统计:');
const stats = db.prepare(`
SELECT
(SELECT COUNT(*) FROM users) as total_users,
(SELECT COUNT(*) FROM tags) as total_tags,
(SELECT COUNT(*) FROM tag_categories) as total_categories,
(SELECT COUNT(*) FROM user_tags) as total_relationships
`).get();
console.log(` • 总用户: ${stats.total_users}`);
console.log(` • 总标签: ${stats.total_tags}`);
console.log(` • 分类数: ${stats.total_categories}`);
console.log(` • 用户-标签关系: ${stats.total_relationships}`);
// 显示分类统计
console.log('\n分类覆盖统计');
const catStats = db.prepare(`
SELECT tc.name, COUNT(t.id) as tag_count, COUNT(DISTINCT ut.user_id) as user_count
FROM tag_categories tc
LEFT JOIN tags t ON tc.id = t.category_id
LEFT JOIN user_tags ut ON t.id = ut.tag_id
GROUP BY tc.id
ORDER BY tc.id
`).all();
catStats.forEach(stat => {
const coverage = stats.total_users > 0 ? ((stat.user_count || 0) * 100 / stats.total_users).toFixed(1) : 0;
console.log(`${stat.name}: ${stat.tag_count || 0} 标签, ${stat.user_count || 0} 用户 (${coverage}%)`);
});
db.close();
console.log('\n🎉 导入流程完成!\n');
} catch (error) {
console.error('❌ 导入失败:', error.message);
console.error(error.stack);
process.exit(1);
}
}
function updateTagStats(db) {
const updateStmt = db.prepare(`
UPDATE tags
SET
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
coverage_rate = ROUND(
(SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 /
(SELECT COUNT(*) FROM users),
2
)
WHERE id = ?
`);
const allTags = db.prepare('SELECT id FROM tags').all();
allTags.forEach(tag => {
updateStmt.run(tag.id);
});
}
// 执行主程序
main();