432 lines
18 KiB
JavaScript
432 lines
18 KiB
JavaScript
/**
 * 数据种子 — 洋葱客户大数据标签系统 (K12教育/家长画像)
 *
 * 目标受众:初中、高中学生的家长,关注家庭结构、学业情况、消费能力等。
 */
|
||
|
||
const { getDb, initializeDatabase } = require('./init');
|
||
|
||
// =============================================
|
||
// 洋葱客户标签体系定义
|
||
// =============================================
|
||
|
||
/**
 * Tag taxonomy for the Onion customer profiling data set (K12 parents).
 *
 * Each entry is one tag category: `key` (category identifier), `name`
 * (display name), `color` (hex color string) and `tags`, a list of
 * `{ key, name, desc }` tag definitions. Array order is significant: the
 * seeding code uses the array index as the sort order for categories and tags.
 */
const TAG_SYSTEM = [
  {
    // Which family member mainly manages the account.
    key: 'parent_identity', name: '家庭角色', color: '#6366f1',
    tags: [
      { key: 'pi_mom', name: '母亲主导', desc: '主要由母亲参与管理' },
      { key: 'pi_dad', name: '父亲主导', desc: '主要由父亲参与管理' },
      { key: 'pi_both', name: '双亲共育', desc: '父母共同活跃参与' },
      { key: 'pi_single', name: '单亲家庭', desc: '系统标识为单亲状态' },
      { key: 'pi_grand', name: '隔代参与', desc: '祖辈有关联操作或代付' },
    ],
  },
  {
    // City tier of the household.
    key: 'city_level', name: '所在城市', color: '#8b5cf6',
    tags: [
      { key: 'ct_tier1', name: '一线城市', desc: '北上广深' },
      { key: 'ct_new_tier1', name: '新一线', desc: '杭州/成都/武汉等15城' },
      { key: 'ct_tier2', name: '二线城市', desc: '省会及副省级城市' },
      { key: 'ct_tier3', name: '三线及以下', desc: '地级市及县城下沉市场' },
      { key: 'ct_overseas', name: '海外及港澳台', desc: '非大陆地区访问' },
    ],
  },
  {
    // Monthly household income bracket.
    key: 'income', name: '家庭月收入', color: '#a855f7',
    tags: [
      { key: 'inc_high', name: '高收入 (>5w)', desc: '家庭月收入5万以上' },
      { key: 'inc_mid_high', name: '中高 (2w-5w)', desc: '家庭月收入2万至5万' },
      { key: 'inc_mid', name: '中等 (1w-2w)', desc: '家庭月收入1万至2万' },
      { key: 'inc_low', name: '偏低 (<1w)', desc: '家庭月收入1万以下' },
    ],
  },
  {
    // Number of children bound to the account.
    key: 'child_count', name: '子女数量', color: '#ec4899',
    tags: [
      { key: 'cc_one', name: '独生子女', desc: '仅有一个孩子注册' },
      { key: 'cc_two', name: '二胎家庭', desc: '绑定两个孩子' },
      { key: 'cc_multi', name: '三胎及以上', desc: '绑定三个及以上孩子' },
      { key: 'cc_cross', name: '跨学段多孩', desc: '多个孩子处于不同学段' },
    ],
  },
  {
    // School stage of the child.
    key: 'child_stage', name: '孩子学段', color: '#f59e0b',
    tags: [
      { key: 'cs_mid', name: '初中阶段', desc: '处于初一至初三年级' },
      { key: 'cs_high', name: '高中阶段', desc: '处于高一至高三年级' },
      { key: 'cs_transition', name: '小升初/初升高', desc: '处于升学接轨期' },
    ],
  },
  {
    // Concrete grade of the child.
    key: 'child_grade', name: '具体年级', color: '#f97316',
    tags: [
      { key: 'cg_mid1', name: '初一 (7年级)', desc: '初一年级' },
      { key: 'cg_mid2', name: '初二 (8年级)', desc: '初二年级' },
      { key: 'cg_mid3', name: '初三 (9年级)', desc: '中考备战期' },
      { key: 'cg_high1', name: '高一 (10年级)', desc: '高一年级' },
      { key: 'cg_high2', name: '高二 (11年级)', desc: '高二分班/学考期' },
      { key: 'cg_high3', name: '高三 (12年级)', desc: '高考冲刺期' },
    ],
  },
  {
    // Learning preference / study goal.
    key: 'study_pref', name: '学习偏好', color: '#ef4444',
    tags: [
      { key: 'sp_top', name: '培优拔高', desc: '注重竞赛、难题突破' },
      { key: 'sp_base', name: '基础巩固', desc: '注重课内知识达标' },
      { key: 'sp_art', name: '艺体生', desc: '艺术/体育专业方向考学' },
      { key: 'sp_abroad', name: '出国留学', desc: '有国际路线意向' },
      { key: 'sp_self', name: '自主探究', desc: '孩子主动学习能力强' },
    ],
  },
  {
    // Weak subjects.
    key: 'subject_weak', name: '薄弱学科', color: '#14b8a6',
    tags: [
      { key: 'sw_math', name: '数学薄弱', desc: '数学经常低于平均分' },
      { key: 'sw_english', name: '英语薄弱', desc: '英语单词/听力为短板' },
      { key: 'sw_science', name: '理综薄弱', desc: '物理/化学跨学科困难' },
      { key: 'sw_arts', name: '文综薄弱', desc: '政史地背诵/理解困难' },
      { key: 'sw_chinese', name: '语文薄弱', desc: '阅读理解/作文得分低' },
    ],
  },
  {
    // Type of school the child attends.
    key: 'school_type', name: '学校类型', color: '#22c55e',
    tags: [
      { key: 'st_key', name: '重点/示范校', desc: '省/市级重点中学' },
      // Description typo fixed: "公理" -> "公办" (public school), matching the tag name.
      { key: 'st_normal', name: '普通公办', desc: '常规公办中学' },
      { key: 'st_private', name: '私立/民办', desc: '高收费民办学校' },
      { key: 'st_intl', name: '国际学校', desc: '双语或国际课程学校' },
      { key: 'st_town', name: '乡镇/县域', desc: '非市区下沉学校' },
    ],
  },
  {
    // Occupation of the parent(s).
    key: 'parent_job', name: '家长职业', color: '#3b82f6',
    tags: [
      { key: 'pj_gov', name: '体制内/国企', desc: '公务员、教师、医生等' },
      { key: 'pj_corp', name: '企业白领/高管', desc: '外企、大厂、管理层' },
      { key: 'pj_biz', name: '个体/私营', desc: '企业主、商户' },
      { key: 'pj_free', name: '自由职业', desc: '弹性工作制' },
      { key: 'pj_fulltime', name: '全职妈妈', desc: '脱产带娃' },
      { key: 'pj_worker', name: '蓝领/基层', desc: '制造业、服务业基层' },
    ],
  },
  {
    // Activity / engagement pattern.
    key: 'engagement', name: '活跃特征', color: '#06b6d4',
    tags: [
      { key: 'eng_active_daily', name: '日活用户', desc: '每日登录做题/检查' },
      { key: 'eng_weekend', name: '周末活跃', desc: '集中在周末使用' },
      { key: 'eng_exam', name: '考前突击', desc: '期中/期末活跃度飙升' },
      { key: 'eng_dormant', name: '沉默用户', desc: '超过30天未登录' },
      { key: 'eng_paid', name: '付费会员', desc: '购买了长期课程/资料' },
    ],
  },
  {
    // Devices used.
    key: 'device', name: '设备信息', color: '#64748b',
    tags: [
      { key: 'dv_ios', name: 'iOS 主导', desc: '主要用 iPhone/iPad' },
      { key: 'dv_android', name: 'Android 主导', desc: '主要用安卓设备' },
      { key: 'dv_pad', name: '平板活跃', desc: '大量时间在Pad上学习' },
      { key: 'dv_pc', name: 'PC/网页端', desc: '常用电脑宽屏上课' },
    ],
  },
];
|
||
|
||
// =============================================
|
||
// 数据生成
|
||
// =============================================
|
||
|
||
/**
 * Return a pseudo-random integer between `min` and `max`, both inclusive.
 *
 * @param {number} min - Lower bound (assumed to be an integer).
 * @param {number} max - Upper bound (assumed to be an integer >= min).
 * @returns {number} A value drawn via Math.random() from [min, max].
 */
function random(min, max) {
  const span = max - min + 1; // number of candidate integers
  const offset = Math.floor(Math.random() * span);
  return offset + min;
}
|
||
|
||
/**
 * Choose one element of `arr` uniformly at random.
 *
 * @param {Array} arr - Candidate values.
 * @returns {*} A randomly selected element; `undefined` when `arr` is empty.
 */
function pick(arr) {
  const index = Math.floor(Math.random() * arr.length);
  return arr[index];
}
|
||
|
||
/**
 * Choose one option at random, with probability proportional to its weight.
 *
 * Each option owns a half-open slice of [0, total): the first option covers
 * [0, w0), the second [w0, w0 + w1), and so on. A zero-weight option owns an
 * empty slice and is therefore never selected.
 *
 * @param {Array<{value: *, weight: number}>} options - Candidates; must be non-empty.
 * @returns {*} The `value` of the selected option.
 * @throws {TypeError} If `options` is empty.
 */
function weightedPick(options) {
  if (options.length === 0) {
    throw new TypeError('weightedPick requires at least one option');
  }

  const total = options.reduce((sum, option) => sum + option.weight, 0);
  let remaining = Math.random() * total; // in [0, total)

  for (const option of options) {
    remaining -= option.weight;
    // Strict comparison: with `<=` a draw of exactly 0 would select a leading
    // zero-weight option, and exact boundary draws would land in the wrong slice.
    if (remaining < 0) return option.value;
  }

  // Reached only through floating-point rounding or non-numeric weights
  // (which make `total` NaN); fall back to the last option.
  return options[options.length - 1].value;
}
|
||
|
||
/**
 * Populate the 'onion' database with simulated data.
 *
 * Steps:
 *   1. Insert every category and tag from TAG_SYSTEM (array index = sort order).
 *   2. Insert USER_COUNT users in batched transactions and link each one to the
 *      tags produced by generateUserTags().
 *   3. Recompute per-tag coverage / coverage_rate and assign a simulated trend.
 *
 * Progress is written to stdout. The database handle is always closed, even
 * when a step throws; the error is then propagated to the caller.
 */
function seedData() {
  initializeDatabase('onion');
  const db = getDb('onion');

  try {
    console.log('🏗️ 开始生成 洋葱客户大数据 模拟数据...\n');

    const USER_COUNT = 50_000; // requested data volume: 50k users

    // === Step 1: create tag categories and tags ===
    console.log('📌 Step 1: 创建标签体系...');

    const insertCat = db.prepare(
      'INSERT INTO tag_categories (key, name, sort_order, color) VALUES (?, ?, ?, ?)'
    );
    const insertTag = db.prepare(
      'INSERT INTO tags (key, name, category_id, description, sort_order) VALUES (?, ?, ?, ?, ?)'
    );

    // tag key -> { id: database row id, catKey: owning category key }
    const tagMap = {};
    let totalTags = 0;

    const txTags = db.transaction(() => {
      TAG_SYSTEM.forEach((cat, ci) => {
        const catRes = insertCat.run(cat.key, cat.name, ci, cat.color);
        const catId = catRes.lastInsertRowid;

        cat.tags.forEach((tag, ti) => {
          const tagRes = insertTag.run(tag.key, tag.name, catId, tag.desc, ti);
          tagMap[tag.key] = { id: Number(tagRes.lastInsertRowid), catKey: cat.key };
          totalTags++;
        });
      });
    });
    txTags();
    console.log(` ✅ ${TAG_SYSTEM.length} 个分类,${totalTags} 个标签\n`);

    // === Step 2: generate users and their tag links ===
    console.log(`👥 Step 2: 生成 ${USER_COUNT.toLocaleString()} 个家长/学生用户...`);

    const insertUser = db.prepare('INSERT INTO users (uid, name, email) VALUES (?, ?, ?)');
    const insertUserTag = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');

    const BATCH = 5000; // users inserted per transaction
    let tagAssignments = 0;

    for (let start = 0; start < USER_COUNT; start += BATCH) {
      // Clamp the last batch so exactly USER_COUNT users are created even when
      // USER_COUNT is not a multiple of BATCH.
      const end = Math.min(start + BATCH, USER_COUNT);

      const tx = db.transaction(() => {
        for (let idx = start + 1; idx <= end; idx++) {
          const uid = `u_${String(idx).padStart(7, '0')}`;

          const userRes = insertUser.run(uid, `家长 ${idx}`, `parent${idx}@onion.example.com`);
          const userId = userRes.lastInsertRowid;

          for (const tagKey of generateUserTags()) {
            // Keys that are not part of TAG_SYSTEM are skipped.
            if (tagMap[tagKey]) {
              insertUserTag.run(userId, tagMap[tagKey].id);
              tagAssignments++;
            }
          }
        }
      });
      tx();

      // Report after the first batch, every second batch, and the final batch.
      const batchNo = start / BATCH + 1;
      if (batchNo === 1 || batchNo % 2 === 0 || end === USER_COUNT) {
        process.stdout.write(` 进度: ${end.toLocaleString()}/${USER_COUNT.toLocaleString()}\n`);
      }
    }
    console.log(`\n ✅ ${USER_COUNT.toLocaleString()} 个用户,${tagAssignments.toLocaleString()} 个标签关联\n`);

    // === Step 3: refresh per-tag statistics ===
    console.log('📈 Step 3: 统计标签覆盖...');
    // USER_COUNT is a numeric constant defined above, so interpolating it into
    // the SQL text is safe.
    db.exec(`
      UPDATE tags SET
        coverage = (SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id),
        coverage_rate = ROUND((SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ${USER_COUNT}, 2)
    `);

    // Simulated trend per tag, within ±5 (two decimals). The previous
    // expression used "- 3", which produced values in [-3, 7) instead.
    const allTags = db.prepare('SELECT id FROM tags').all();
    const updateTrend = db.prepare('UPDATE tags SET trend = ? WHERE id = ?');
    const txTrend = db.transaction(() => {
      for (const t of allTags) {
        updateTrend.run(Number((Math.random() * 10 - 5).toFixed(2)), t.id);
      }
    });
    txTrend();

    console.log(' ✅ 统计完成\n');
    console.log('🎉 数据生成完毕!可启动 server.js');
  } finally {
    // Release the connection on success and on failure alike.
    db.close();
  }
}
|
||
|
||
// =============================================
|
||
// 标签分配逻辑(相关性构建)
|
||
// =============================================
|
||
|
||
/**
 * Generate the tag keys for one simulated user.
 *
 * Tags are drawn with weighted randomness, and several dimensions are
 * correlated on purpose: income depends on city tier, study preference and
 * school type depend on income/city, engagement depends on income, study
 * preference and parent occupation, and paid membership depends on income,
 * study preference and engagement.
 *
 * Every user gets exactly one family-role, city and income tag; other
 * dimensions may contribute several tags.
 *
 * @returns {string[]} Unique tag keys (keys as defined in TAG_SYSTEM).
 */
function generateUserTags() {
  const tags = [];

  // Family role.
  const role = weightedPick([
    { value: 'pi_mom', weight: 60 },
    { value: 'pi_dad', weight: 20 },
    { value: 'pi_both', weight: 12 },
    { value: 'pi_single', weight: 5 },
    { value: 'pi_grand', weight: 3 },
  ]);
  tags.push(role);

  // City tier.
  const city = weightedPick([
    { value: 'ct_tier1', weight: 15 },
    { value: 'ct_new_tier1', weight: 25 },
    { value: 'ct_tier2', weight: 30 },
    { value: 'ct_tier3', weight: 28 },
    { value: 'ct_overseas', weight: 2 },
  ]);
  tags.push(city);

  // Household income, strongly correlated with city tier. Exactly one income
  // tag is assigned. (An earlier duplicate of this step passed `w` instead of
  // `weight`, which always resolved to 'inc_low' and was pushed in addition
  // to the real income tag.)
  let incomeWeights;
  if (city === 'ct_tier1' || city === 'ct_overseas') {
    incomeWeights = [
      { value: 'inc_high', weight: 30 },
      { value: 'inc_mid_high', weight: 40 },
      { value: 'inc_mid', weight: 20 },
      { value: 'inc_low', weight: 10 },
    ];
  } else if (city === 'ct_tier3') {
    incomeWeights = [
      { value: 'inc_high', weight: 5 },
      { value: 'inc_mid_high', weight: 15 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 40 },
    ];
  } else {
    incomeWeights = [
      { value: 'inc_high', weight: 10 },
      { value: 'inc_mid_high', weight: 30 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 20 },
    ];
  }
  const income = weightedPick(incomeWeights);
  tags.push(income);

  // Number of children.
  const childCount = weightedPick([
    { value: 'cc_one', weight: 55 },
    { value: 'cc_two', weight: 40 },
    { value: 'cc_multi', weight: 5 },
  ]);
  tags.push(childCount);
  const hasSeveralChildren = childCount === 'cc_two' || childCount === 'cc_multi';
  // The cross-stage marker is drawn independently of the stage tags below.
  if (hasSeveralChildren && Math.random() < 0.3) tags.push('cc_cross');

  // School stage and grade of the primary child.
  const stage = weightedPick([
    { value: 'cs_mid', weight: 60 },
    { value: 'cs_high', weight: 40 },
  ]);
  tags.push(stage);

  if (stage === 'cs_mid') {
    tags.push(weightedPick([
      { value: 'cg_mid1', weight: 35 },
      { value: 'cg_mid2', weight: 35 },
      { value: 'cg_mid3', weight: 30 },
    ]));
  } else {
    tags.push(weightedPick([
      { value: 'cg_high1', weight: 40 },
      { value: 'cg_high2', weight: 35 },
      { value: 'cg_high3', weight: 25 },
    ]));
  }

  // Multi-child households usually get a second child's grade as well. The
  // stage is derived from that grade so every grade tag has its matching stage
  // tag (picking both independently could yield e.g. 'cg_high2' without
  // 'cs_high'). The marginal distributions stay uniform as before.
  if (hasSeveralChildren && Math.random() < 0.7) {
    const siblingGrade = pick(['cg_mid1', 'cg_mid2', 'cg_mid3', 'cg_high1', 'cg_high2', 'cg_high3']);
    tags.push(siblingGrade.startsWith('cg_mid') ? 'cs_mid' : 'cs_high');
    tags.push(siblingGrade);
  }
  if (Math.random() < 0.25) tags.push('cs_transition');

  // Study preference: high income or tier-1 city skews toward advanced and
  // overseas tracks.
  let prefWeights;
  if (income === 'inc_high' || city === 'ct_tier1') {
    prefWeights = [
      { value: 'sp_top', weight: 30 },
      { value: 'sp_abroad', weight: 20 },
      { value: 'sp_base', weight: 30 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 5 },
    ];
  } else {
    prefWeights = [
      { value: 'sp_base', weight: 60 },
      { value: 'sp_top', weight: 15 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 8 },
      { value: 'sp_abroad', weight: 2 },
    ];
  }
  tags.push(weightedPick(prefWeights));
  if (Math.random() < 0.6) tags.push(pick(['sp_base', 'sp_self', 'sp_top']));

  // Weak subjects: one guaranteed draw plus up to two optional ones, so users
  // commonly end up with more than one.
  tags.push(weightedPick([
    { value: 'sw_math', weight: 35 },
    { value: 'sw_english', weight: 25 },
    { value: 'sw_science', weight: 20 },
    { value: 'sw_arts', weight: 10 },
    { value: 'sw_chinese', weight: 10 },
  ]));
  if (Math.random() < 0.7) tags.push(pick(['sw_math', 'sw_english', 'sw_science', 'sw_arts']));
  if (Math.random() < 0.3) tags.push(pick(['sw_chinese', 'sw_arts', 'sw_science']));

  // School type.
  if (city === 'ct_tier3') {
    tags.push(weightedPick([
      { value: 'st_normal', weight: 50 },
      { value: 'st_town', weight: 40 },
      { value: 'st_key', weight: 10 },
    ]));
  } else {
    const schoolWeights = [
      { value: 'st_key', weight: 30 },
      { value: 'st_normal', weight: 50 },
      { value: 'st_private', weight: 15 },
    ];
    // International schools are only offered to high-income households.
    if (income === 'inc_high') schoolWeights.push({ value: 'st_intl', weight: 15 });
    tags.push(weightedPick(schoolWeights));
  }

  // Parent occupation; a second occupation may be added because the two
  // parents can work in different fields.
  if (role === 'pi_mom' && Math.random() < 0.2) {
    tags.push('pj_fulltime');
  } else {
    tags.push(weightedPick([
      { value: 'pj_gov', weight: 25 },
      { value: 'pj_corp', weight: 30 },
      { value: 'pj_biz', weight: 15 },
      { value: 'pj_free', weight: 10 },
      { value: 'pj_worker', weight: 20 },
    ]));
  }
  if (Math.random() < 0.4) {
    tags.push(pick(['pj_gov', 'pj_corp', 'pj_biz', 'pj_free']));
  }

  // Engagement pattern, chosen from a profile-dependent weight table. The
  // first matching rule wins.
  let engWeights;
  if ((income === 'inc_high' || income === 'inc_mid_high') && tags.includes('sp_top')) {
    // Higher income plus an advanced-track preference: most likely daily active.
    engWeights = [
      { value: 'eng_active_daily', weight: 40 },
      { value: 'eng_weekend', weight: 30 },
      { value: 'eng_exam', weight: 20 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (tags.includes('pj_fulltime')) {
    // Full-time parent: highest daily-active share.
    engWeights = [
      { value: 'eng_active_daily', weight: 50 },
      { value: 'eng_weekend', weight: 25 },
      { value: 'eng_exam', weight: 15 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (tags.includes('pj_gov')) {
    // Public sector / state-owned enterprise: moderate daily-active share.
    engWeights = [
      { value: 'eng_active_daily', weight: 30 },
      { value: 'eng_weekend', weight: 35 },
      { value: 'eng_exam', weight: 25 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else if (income === 'inc_high') {
    // High income in general: above-average activity.
    engWeights = [
      { value: 'eng_active_daily', weight: 35 },
      { value: 'eng_weekend', weight: 30 },
      { value: 'eng_exam', weight: 25 },
      { value: 'eng_dormant', weight: 10 },
    ];
  } else {
    // Everyone else.
    engWeights = [
      { value: 'eng_active_daily', weight: 20 },
      { value: 'eng_weekend', weight: 35 },
      { value: 'eng_exam', weight: 30 },
      { value: 'eng_dormant', weight: 15 },
    ];
  }
  tags.push(weightedPick(engWeights));
  if (Math.random() < 0.4) tags.push('eng_exam');

  // Paid membership: more likely for high income, advanced-track or daily
  // active users.
  const likelyPayer =
    income === 'inc_high' || tags.includes('sp_top') || tags.includes('eng_active_daily');
  if (likelyPayer) {
    if (Math.random() < 0.5) tags.push('eng_paid');
  } else if (Math.random() < 0.15) {
    tags.push('eng_paid');
  }

  // Devices: one primary device plus optional extras (multi-device use is common).
  tags.push(weightedPick([
    { value: 'dv_ios', weight: 40 },
    { value: 'dv_android', weight: 50 },
    { value: 'dv_pc', weight: 10 },
  ]));
  if (Math.random() < 0.5) tags.push('dv_pad');
  if (Math.random() < 0.3) tags.push('dv_pc');
  if (Math.random() < 0.3) tags.push(pick(['dv_ios', 'dv_android']));

  // The optional draws above can repeat a key, so return each key only once.
  return [...new Set(tags)];
}
|
||
|
||
seedData();
|