Files
onion-dmp/db/seed.js
2026-04-08 14:52:09 +08:00

432 lines
18 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 数据种子 — 洋葱客户大数据标签系统 (K12教育/家长画像)
*
* 目标受众:初中、高中学生的家长,关注家庭结构、学业情况、消费能力等。
*/
const { getDb, initializeDatabase } = require('./init');
// =============================================
// 洋葱客户标签体系定义
// =============================================
/**
 * Tag taxonomy for the seeded customer-profile data.
 *
 * Each entry is one tag category:
 *   - key:   unique category key
 *   - name:  display name
 *   - color: hex color string stored with the category
 *   - tags:  tags in display order, each with a unique `key`, a display
 *            `name` and a short `desc`
 *
 * The array order and the order of each `tags` list are used as the
 * `sort_order` values when the taxonomy is inserted by seedData().
 */
const TAG_SYSTEM = [
  {
    key: 'parent_identity', name: '家庭角色', color: '#6366f1',
    tags: [
      { key: 'pi_mom', name: '母亲主导', desc: '主要由母亲参与管理' },
      { key: 'pi_dad', name: '父亲主导', desc: '主要由父亲参与管理' },
      { key: 'pi_both', name: '双亲共育', desc: '父母共同活跃参与' },
      { key: 'pi_single', name: '单亲家庭', desc: '系统标识为单亲状态' },
      { key: 'pi_grand', name: '隔代参与', desc: '祖辈有关联操作或代付' },
    ]
  },
  {
    key: 'city_level', name: '所在城市', color: '#8b5cf6',
    tags: [
      { key: 'ct_tier1', name: '一线城市', desc: '北上广深' },
      { key: 'ct_new_tier1', name: '新一线', desc: '杭州/成都/武汉等15城' },
      { key: 'ct_tier2', name: '二线城市', desc: '省会及副省级城市' },
      { key: 'ct_tier3', name: '三线及以下', desc: '地级市及县城下沉市场' },
      { key: 'ct_overseas', name: '海外及港澳台', desc: '非大陆地区访问' },
    ]
  },
  {
    key: 'income', name: '家庭月收入', color: '#a855f7',
    tags: [
      { key: 'inc_high', name: '高收入 (>5w)', desc: '家庭月收入5万以上' },
      { key: 'inc_mid_high', name: '中高 (2w-5w)', desc: '家庭月收入2万至5万' },
      { key: 'inc_mid', name: '中等 (1w-2w)', desc: '家庭月收入1万至2万' },
      { key: 'inc_low', name: '偏低 (<1w)', desc: '家庭月收入1万以下' },
    ]
  },
  {
    key: 'child_count', name: '子女数量', color: '#ec4899',
    tags: [
      { key: 'cc_one', name: '独生子女', desc: '仅有一个孩子注册' },
      { key: 'cc_two', name: '二胎家庭', desc: '绑定两个孩子' },
      { key: 'cc_multi', name: '三胎及以上', desc: '绑定三个及以上孩子' },
      { key: 'cc_cross', name: '跨学段多孩', desc: '多个孩子处于不同学段' },
    ]
  },
  {
    key: 'child_stage', name: '孩子学段', color: '#f59e0b',
    tags: [
      { key: 'cs_mid', name: '初中阶段', desc: '处于初一至初三年级' },
      { key: 'cs_high', name: '高中阶段', desc: '处于高一至高三年级' },
      { key: 'cs_transition', name: '小升初/初升高', desc: '处于升学接轨期' },
    ]
  },
  {
    key: 'child_grade', name: '具体年级', color: '#f97316',
    tags: [
      { key: 'cg_mid1', name: '初一 (7年级)', desc: '初一年级' },
      { key: 'cg_mid2', name: '初二 (8年级)', desc: '初二年级' },
      { key: 'cg_mid3', name: '初三 (9年级)', desc: '中考备战期' },
      { key: 'cg_high1', name: '高一 (10年级)', desc: '高一年级' },
      { key: 'cg_high2', name: '高二 (11年级)', desc: '高二分班/学考期' },
      { key: 'cg_high3', name: '高三 (12年级)', desc: '高考冲刺期' },
    ]
  },
  {
    key: 'study_pref', name: '学习偏好', color: '#ef4444',
    tags: [
      { key: 'sp_top', name: '培优拔高', desc: '注重竞赛、难题突破' },
      { key: 'sp_base', name: '基础巩固', desc: '注重课内知识达标' },
      { key: 'sp_art', name: '艺体生', desc: '艺术/体育专业方向考学' },
      { key: 'sp_abroad', name: '出国留学', desc: '有国际路线意向' },
      { key: 'sp_self', name: '自主探究', desc: '孩子主动学习能力强' },
    ]
  },
  {
    key: 'subject_weak', name: '薄弱学科', color: '#14b8a6',
    tags: [
      { key: 'sw_math', name: '数学薄弱', desc: '数学经常低于平均分' },
      { key: 'sw_english', name: '英语薄弱', desc: '英语单词/听力为短板' },
      { key: 'sw_science', name: '理综薄弱', desc: '物理/化学跨学科困难' },
      { key: 'sw_arts', name: '文综薄弱', desc: '政史地背诵/理解困难' },
      { key: 'sw_chinese', name: '语文薄弱', desc: '阅读理解/作文得分低' },
    ]
  },
  {
    key: 'school_type', name: '学校类型', color: '#22c55e',
    tags: [
      { key: 'st_key', name: '重点/示范校', desc: '省/市级重点中学' },
      // Fixed typo: the description previously read "公理" (axiom) instead of "公立" (public).
      { key: 'st_normal', name: '普通公办', desc: '常规公立中学' },
      { key: 'st_private', name: '私立/民办', desc: '高收费民办学校' },
      { key: 'st_intl', name: '国际学校', desc: '双语或国际课程学校' },
      { key: 'st_town', name: '乡镇/县域', desc: '非市区下沉学校' },
    ]
  },
  {
    key: 'parent_job', name: '家长职业', color: '#3b82f6',
    tags: [
      { key: 'pj_gov', name: '体制内/国企', desc: '公务员、教师、医生等' },
      { key: 'pj_corp', name: '企业白领/高管', desc: '外企、大厂、管理层' },
      { key: 'pj_biz', name: '个体/私营', desc: '企业主、商户' },
      { key: 'pj_free', name: '自由职业', desc: '弹性工作制' },
      { key: 'pj_fulltime', name: '全职妈妈', desc: '脱产带娃' },
      { key: 'pj_worker', name: '蓝领/基层', desc: '制造业、服务业基层' },
    ]
  },
  {
    key: 'engagement', name: '活跃特征', color: '#06b6d4',
    tags: [
      { key: 'eng_active_daily', name: '日活用户', desc: '每日登录做题/检查' },
      { key: 'eng_weekend', name: '周末活跃', desc: '集中在周末使用' },
      { key: 'eng_exam', name: '考前突击', desc: '期中/期末活跃度飙升' },
      { key: 'eng_dormant', name: '沉默用户', desc: '超过30天未登录' },
      { key: 'eng_paid', name: '付费会员', desc: '购买了长期课程/资料' },
    ]
  },
  {
    key: 'device', name: '设备信息', color: '#64748b',
    tags: [
      { key: 'dv_ios', name: 'iOS 主导', desc: '主要用 iPhone/iPad' },
      { key: 'dv_android', name: 'Android 主导', desc: '主要用安卓设备' },
      { key: 'dv_pad', name: '平板活跃', desc: '大量时间在Pad上学习' },
      { key: 'dv_pc', name: 'PC/网页端', desc: '常用电脑宽屏上课' },
    ]
  },
];
// =============================================
// 数据生成
// =============================================
/**
 * Returns a pseudo-random integer between `min` and `max`, both inclusive.
 *
 * @param {number} min - Lower bound (inclusive).
 * @param {number} max - Upper bound (inclusive).
 * @returns {number} An integer in [min, max].
 */
function random(min, max) {
  const span = max - min + 1;
  const offset = Math.floor(Math.random() * span);
  return offset + min;
}
/**
 * Returns one element of `arr`, chosen uniformly at random.
 * For an empty array the result is `undefined`.
 *
 * @param {Array} arr - Candidate values.
 * @returns {*} The selected element.
 */
function pick(arr) {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
/**
 * Picks one `value` from `options` with probability proportional to its `weight`.
 *
 * A random point is drawn in [0, total weight) and each option's weight is
 * subtracted from it in order; the first option that brings the remainder to
 * zero or below wins. If no option does (e.g. floating-point leftovers or
 * non-numeric weights), the last option's value is returned.
 *
 * @param {Array<{value: *, weight: number}>} options - Weighted candidates.
 * @returns {*} The `value` of the selected option.
 */
function weightedPick(options) {
  let totalWeight = 0;
  for (const option of options) {
    totalWeight += option.weight;
  }

  let remaining = Math.random() * totalWeight;
  for (const { value, weight } of options) {
    remaining -= weight;
    if (remaining <= 0) {
      return value;
    }
  }

  const lastOption = options[options.length - 1];
  return lastOption.value;
}
/**
 * Populates the 'onion' database with simulated data.
 *
 * Steps:
 *   1. Insert every category and tag from TAG_SYSTEM, remembering each tag's row id.
 *   2. Insert USER_COUNT users in batches (one transaction per batch) and link
 *      each user to the tags returned by generateUserTags().
 *   3. Recompute `coverage` / `coverage_rate` for every tag and assign each tag
 *      a random `trend` value in [-5, 5).
 *
 * The database handle is always closed, even when a step throws.
 * NOTE(review): the db API used here (prepare/run/transaction/exec) looks like
 * better-sqlite3 — confirm against ./init.
 */
function seedData() {
  initializeDatabase('onion');
  const db = getDb('onion');
  try {
    console.log('🏗️ 开始生成 洋葱客户大数据 模拟数据...\n');
    const USER_COUNT = 50_000; // requested data volume: 50k users

    // === Step 1: tag categories and tags ===
    console.log('📌 Step 1: 创建标签体系...');
    const insertCat = db.prepare(
      'INSERT INTO tag_categories (key, name, sort_order, color) VALUES (?, ?, ?, ?)'
    );
    const insertTag = db.prepare(
      'INSERT INTO tags (key, name, category_id, description, sort_order) VALUES (?, ?, ?, ?, ?)'
    );
    // tag key -> { id, catKey }, used to resolve generated tag keys to row ids.
    const tagMap = {};
    let totalTags = 0;
    const txTags = db.transaction(() => {
      TAG_SYSTEM.forEach((cat, ci) => {
        const catRes = insertCat.run(cat.key, cat.name, ci, cat.color);
        const catId = catRes.lastInsertRowid;
        cat.tags.forEach((tag, ti) => {
          const tagRes = insertTag.run(tag.key, tag.name, catId, tag.desc, ti);
          tagMap[tag.key] = { id: Number(tagRes.lastInsertRowid), catKey: cat.key };
          totalTags++;
        });
      });
    });
    txTags();
    console.log(`${TAG_SYSTEM.length} 个分类,${totalTags} 个标签\n`);

    // === Step 2: users and their tags ===
    console.log(`👥 Step 2: 生成 ${USER_COUNT.toLocaleString()} 个家长/学生用户...`);
    const insertUser = db.prepare('INSERT INTO users (uid, name, email) VALUES (?, ?, ?)');
    const insertUserTag = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
    const BATCH = 5000;
    // Round up and clamp the last batch so exactly USER_COUNT users are
    // created even if USER_COUNT is not a multiple of BATCH.
    const batchCount = Math.ceil(USER_COUNT / BATCH);
    let tagAssignments = 0;
    for (let batch = 0; batch < batchCount; batch++) {
      const start = batch * BATCH;
      const end = Math.min(start + BATCH, USER_COUNT);
      const tx = db.transaction(() => {
        for (let idx = start + 1; idx <= end; idx++) {
          const uid = `u_${String(idx).padStart(7, '0')}`;
          const userRes = insertUser.run(uid, `家长 ${idx}`, `parent${idx}@onion.example.com`);
          const userId = userRes.lastInsertRowid;
          for (const tagKey of generateUserTags()) {
            // Skip keys that are not part of the inserted taxonomy.
            if (tagMap[tagKey]) {
              insertUserTag.run(userId, tagMap[tagKey].id);
              tagAssignments++;
            }
          }
        }
      });
      tx();
      // Report after the first batch and then after every second batch.
      if ((batch + 1) % 2 === 0 || batch === 0) {
        process.stdout.write(` 进度: ${end.toLocaleString()}/${USER_COUNT.toLocaleString()}\n`);
      }
    }
    console.log(`\n${USER_COUNT.toLocaleString()} 个用户,${tagAssignments.toLocaleString()} 个标签关联\n`);

    // === Step 3: per-tag statistics ===
    console.log('📈 Step 3: 统计标签覆盖...');
    db.exec(`
      UPDATE tags SET
        coverage = (SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id),
        coverage_rate = ROUND((SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ${USER_COUNT}, 2)
    `);

    // Simulated trend: uniformly distributed in [-5, 5), rounded to 2 decimals.
    const allTags = db.prepare('SELECT id FROM tags').all();
    const updateTrend = db.prepare('UPDATE tags SET trend = ? WHERE id = ?');
    const txTrend = db.transaction(() => {
      for (const t of allTags) {
        const trend = Number((Math.random() * 10 - 5).toFixed(2));
        updateTrend.run(trend, t.id);
      }
    });
    txTrend();
    console.log(' ✅ 统计完成\n');
    console.log('🎉 数据生成完毕!可启动 server.js');
  } finally {
    // Release the handle on both the success and the failure path.
    db.close();
  }
}
// =============================================
// 标签分配逻辑(相关性构建)
// =============================================
/**
 * Builds the list of tag keys for one simulated user.
 *
 * Tags are drawn category by category with weighted randomness, and several
 * categories are correlated on purpose: income depends on city tier, study
 * preference and school type depend on income/city, and engagement depends
 * on income, study preference and parent job.
 *
 * Each user gets exactly one role, one city and one income tag; other
 * categories may contribute several tags.
 *
 * @returns {string[]} De-duplicated tag keys in first-insertion order.
 */
function generateUserTags() {
  const tags = [];

  // Family role.
  const role = weightedPick([
    { value: 'pi_mom', weight: 60 },
    { value: 'pi_dad', weight: 20 },
    { value: 'pi_both', weight: 12 },
    { value: 'pi_single', weight: 5 },
    { value: 'pi_grand', weight: 3 },
  ]);
  tags.push(role);

  // City tier.
  const city = weightedPick([
    { value: 'ct_tier1', weight: 15 },
    { value: 'ct_new_tier1', weight: 25 },
    { value: 'ct_tier2', weight: 30 },
    { value: 'ct_tier3', weight: 28 },
    { value: 'ct_overseas', weight: 2 },
  ]);
  tags.push(city);

  // Household income, strongly correlated with city tier. Picked exactly once:
  // an earlier duplicate pick used `w` instead of `weight`, which always
  // resolved to 'inc_low' and tagged every user as low income as well.
  let incomeWeights;
  if (city === 'ct_tier1' || city === 'ct_overseas') {
    incomeWeights = [
      { value: 'inc_high', weight: 30 },
      { value: 'inc_mid_high', weight: 40 },
      { value: 'inc_mid', weight: 20 },
      { value: 'inc_low', weight: 10 },
    ];
  } else if (city === 'ct_tier3') {
    incomeWeights = [
      { value: 'inc_high', weight: 5 },
      { value: 'inc_mid_high', weight: 15 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 40 },
    ];
  } else {
    incomeWeights = [
      { value: 'inc_high', weight: 10 },
      { value: 'inc_mid_high', weight: 30 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 20 },
    ];
  }
  const actualIncome = weightedPick(incomeWeights);
  tags.push(actualIncome);

  // Number of children.
  const childCount = weightedPick([
    { value: 'cc_one', weight: 55 },
    { value: 'cc_two', weight: 40 },
    { value: 'cc_multi', weight: 5 },
  ]);
  tags.push(childCount);
  const isMultiChild = childCount === 'cc_two' || childCount === 'cc_multi';
  if (isMultiChild && Math.random() < 0.3) {
    tags.push('cc_cross');
  }

  // School stage and the matching grade of the (first) child.
  const stage = weightedPick([
    { value: 'cs_mid', weight: 60 },
    { value: 'cs_high', weight: 40 },
  ]);
  tags.push(stage);
  if (stage === 'cs_mid') {
    tags.push(weightedPick([
      { value: 'cg_mid1', weight: 35 },
      { value: 'cg_mid2', weight: 35 },
      { value: 'cg_mid3', weight: 30 },
    ]));
  } else {
    tags.push(weightedPick([
      { value: 'cg_high1', weight: 40 },
      { value: 'cg_high2', weight: 35 },
      { value: 'cg_high3', weight: 25 },
    ]));
  }

  // Multi-child families usually get a second grade tag. The stage tag is
  // derived from that grade so a grade never appears without its stage.
  if (isMultiChild && Math.random() < 0.7) {
    const siblingGrade = pick(['cg_mid1', 'cg_mid2', 'cg_mid3', 'cg_high1', 'cg_high2', 'cg_high3']);
    tags.push(siblingGrade.startsWith('cg_mid') ? 'cs_mid' : 'cs_high');
    tags.push(siblingGrade);
  }
  if (Math.random() < 0.25) {
    tags.push('cs_transition');
  }

  // Study preference: high income or tier-1 city skews towards advanced/abroad.
  let prefWeights;
  if (actualIncome === 'inc_high' || city === 'ct_tier1') {
    prefWeights = [
      { value: 'sp_top', weight: 30 },
      { value: 'sp_abroad', weight: 20 },
      { value: 'sp_base', weight: 30 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 5 },
    ];
  } else {
    prefWeights = [
      { value: 'sp_base', weight: 60 },
      { value: 'sp_top', weight: 15 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 8 },
      { value: 'sp_abroad', weight: 2 },
    ];
  }
  tags.push(weightedPick(prefWeights));
  if (Math.random() < 0.6) {
    tags.push(pick(['sp_base', 'sp_self', 'sp_top']));
  }

  // Weak subjects: one guaranteed, up to two more to increase natural overlap.
  tags.push(weightedPick([
    { value: 'sw_math', weight: 35 },
    { value: 'sw_english', weight: 25 },
    { value: 'sw_science', weight: 20 },
    { value: 'sw_arts', weight: 10 },
    { value: 'sw_chinese', weight: 10 },
  ]));
  if (Math.random() < 0.7) {
    tags.push(pick(['sw_math', 'sw_english', 'sw_science', 'sw_arts']));
  }
  if (Math.random() < 0.3) {
    tags.push(pick(['sw_chinese', 'sw_arts', 'sw_science']));
  }

  // School type.
  if (city === 'ct_tier3') {
    tags.push(weightedPick([
      { value: 'st_normal', weight: 50 },
      { value: 'st_town', weight: 40 },
      { value: 'st_key', weight: 10 },
    ]));
  } else {
    const stWeights = [
      { value: 'st_key', weight: 30 },
      { value: 'st_normal', weight: 50 },
      { value: 'st_private', weight: 15 },
    ];
    // International schools are only offered to high-income households.
    if (actualIncome === 'inc_high') {
      stWeights.push({ value: 'st_intl', weight: 15 });
    }
    tags.push(weightedPick(stWeights));
  }

  // Parent job; the two parents may differ, so a second job is sometimes added.
  if (role === 'pi_mom' && Math.random() < 0.2) {
    tags.push('pj_fulltime');
  } else {
    tags.push(weightedPick([
      { value: 'pj_gov', weight: 25 },
      { value: 'pj_corp', weight: 30 },
      { value: 'pj_biz', weight: 15 },
      { value: 'pj_free', weight: 10 },
      { value: 'pj_worker', weight: 20 },
    ]));
  }
  if (Math.random() < 0.4) {
    tags.push(pick(['pj_gov', 'pj_corp', 'pj_biz', 'pj_free']));
  }

  // Engagement, chosen from the profile built so far. The first matching
  // rule wins, so the order of these branches matters.
  let engWeights;
  if ((actualIncome === 'inc_high' || actualIncome === 'inc_mid_high') && tags.includes('sp_top')) {
    // Higher income + advanced study preference: most likely daily active.
    engWeights = [
      { value: 'eng_active_daily', weight: 40 },
      { value: 'eng_weekend', weight: 30 },
      { value: 'eng_exam', weight: 20 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (tags.includes('pj_fulltime')) {
    // Full-time parent: highest daily-active probability.
    engWeights = [
      { value: 'eng_active_daily', weight: 50 },
      { value: 'eng_weekend', weight: 25 },
      { value: 'eng_exam', weight: 15 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (tags.includes('pj_gov')) {
    // Public sector / state-owned enterprise: medium daily-active probability.
    engWeights = [
      { value: 'eng_active_daily', weight: 30 },
      { value: 'eng_weekend', weight: 35 },
      { value: 'eng_exam', weight: 25 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (actualIncome === 'inc_high') {
    // High income in general: above-average activity.
    engWeights = [
      { value: 'eng_active_daily', weight: 35 },
      { value: 'eng_weekend', weight: 30 },
      { value: 'eng_exam', weight: 25 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else {
    // Everyone else.
    engWeights = [
      { value: 'eng_active_daily', weight: 20 },
      { value: 'eng_weekend', weight: 35 },
      { value: 'eng_exam', weight: 30 },
      { value: 'eng_dormant', weight: 15 },
    ];
  }
  tags.push(weightedPick(engWeights));
  if (Math.random() < 0.4) {
    tags.push('eng_exam');
  }

  // Paid membership: more likely for high income, advanced preference or daily actives.
  const likelyPayer =
    actualIncome === 'inc_high' || tags.includes('sp_top') || tags.includes('eng_active_daily');
  if (Math.random() < (likelyPayer ? 0.5 : 0.15)) {
    tags.push('eng_paid');
  }

  // Devices: one primary device, plus frequent cross-device usage.
  tags.push(weightedPick([
    { value: 'dv_ios', weight: 40 },
    { value: 'dv_android', weight: 50 },
    { value: 'dv_pc', weight: 10 },
  ]));
  if (Math.random() < 0.5) {
    tags.push('dv_pad');
  }
  if (Math.random() < 0.3) {
    tags.push('dv_pc');
  }
  if (Math.random() < 0.3) {
    tags.push(pick(['dv_ios', 'dv_android']));
  }

  // De-duplicate in memory so repeated pushes never reach the database.
  return [...new Set(tags)];
}
// Script entry point: the seeding runs as soon as this file is loaded.
seedData();