Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

431
db/seed.js Normal file
View File

@@ -0,0 +1,431 @@
/**
* 数据种子 — 洋葱客户大数据标签系统 (K12教育/家长画像)
*
* 目标受众:初中、高中学生的家长,关注家庭结构、学业情况、消费能力等。
*/
const { getDb, initializeDatabase } = require('./init');
// =============================================
// Onion customer tag taxonomy definition
// =============================================
/**
 * Static tag taxonomy used to seed the database.
 *
 * Each entry is a category: `{ key, name, color, tags }`, where `tags` is a
 * list of `{ key, name, desc }`. `seedData` inserts one `tag_categories` row
 * per category and one `tags` row per tag, using the array index as the
 * sort order, so the ordering below is significant.
 *
 * Tag keys must be unique across ALL categories because `generateUserTags`
 * refers to tags by key alone.
 */
const TAG_SYSTEM = [
  {
    key: 'parent_identity', name: '家庭角色', color: '#6366f1',
    tags: [
      { key: 'pi_mom', name: '母亲主导', desc: '主要由母亲参与管理' },
      { key: 'pi_dad', name: '父亲主导', desc: '主要由父亲参与管理' },
      { key: 'pi_both', name: '双亲共育', desc: '父母共同活跃参与' },
      { key: 'pi_single', name: '单亲家庭', desc: '系统标识为单亲状态' },
      { key: 'pi_grand', name: '隔代参与', desc: '祖辈有关联操作或代付' },
    ]
  },
  {
    key: 'city_level', name: '所在城市', color: '#8b5cf6',
    tags: [
      { key: 'ct_tier1', name: '一线城市', desc: '北上广深' },
      { key: 'ct_new_tier1', name: '新一线', desc: '杭州/成都/武汉等15城' },
      { key: 'ct_tier2', name: '二线城市', desc: '省会及副省级城市' },
      { key: 'ct_tier3', name: '三线及以下', desc: '地级市及县城下沉市场' },
      { key: 'ct_overseas', name: '海外及港澳台', desc: '非大陆地区访问' },
    ]
  },
  {
    key: 'income', name: '家庭月收入', color: '#a855f7',
    tags: [
      { key: 'inc_high', name: '高收入 (>5w)', desc: '家庭月收入5万以上' },
      { key: 'inc_mid_high', name: '中高 (2w-5w)', desc: '家庭月收入2万至5万' },
      { key: 'inc_mid', name: '中等 (1w-2w)', desc: '家庭月收入1万至2万' },
      { key: 'inc_low', name: '偏低 (<1w)', desc: '家庭月收入1万以下' },
    ]
  },
  {
    key: 'child_count', name: '子女数量', color: '#ec4899',
    tags: [
      { key: 'cc_one', name: '独生子女', desc: '仅有一个孩子注册' },
      { key: 'cc_two', name: '二胎家庭', desc: '绑定两个孩子' },
      { key: 'cc_multi', name: '三胎及以上', desc: '绑定三个及以上孩子' },
      { key: 'cc_cross', name: '跨学段多孩', desc: '多个孩子处于不同学段' },
    ]
  },
  {
    key: 'child_stage', name: '孩子学段', color: '#f59e0b',
    tags: [
      { key: 'cs_mid', name: '初中阶段', desc: '处于初一至初三年级' },
      { key: 'cs_high', name: '高中阶段', desc: '处于高一至高三年级' },
      { key: 'cs_transition', name: '小升初/初升高', desc: '处于升学接轨期' },
    ]
  },
  {
    key: 'child_grade', name: '具体年级', color: '#f97316',
    tags: [
      { key: 'cg_mid1', name: '初一 (7年级)', desc: '初一年级' },
      { key: 'cg_mid2', name: '初二 (8年级)', desc: '初二年级' },
      { key: 'cg_mid3', name: '初三 (9年级)', desc: '中考备战期' },
      { key: 'cg_high1', name: '高一 (10年级)', desc: '高一年级' },
      { key: 'cg_high2', name: '高二 (11年级)', desc: '高二分班/学考期' },
      { key: 'cg_high3', name: '高三 (12年级)', desc: '高考冲刺期' },
    ]
  },
  {
    key: 'study_pref', name: '学习偏好', color: '#ef4444',
    tags: [
      { key: 'sp_top', name: '培优拔高', desc: '注重竞赛、难题突破' },
      { key: 'sp_base', name: '基础巩固', desc: '注重课内知识达标' },
      { key: 'sp_art', name: '艺体生', desc: '艺术/体育专业方向考学' },
      { key: 'sp_abroad', name: '出国留学', desc: '有国际路线意向' },
      { key: 'sp_self', name: '自主探究', desc: '孩子主动学习能力强' },
    ]
  },
  {
    key: 'subject_weak', name: '薄弱学科', color: '#14b8a6',
    tags: [
      { key: 'sw_math', name: '数学薄弱', desc: '数学经常低于平均分' },
      { key: 'sw_english', name: '英语薄弱', desc: '英语单词/听力为短板' },
      { key: 'sw_science', name: '理综薄弱', desc: '物理/化学跨学科困难' },
      { key: 'sw_arts', name: '文综薄弱', desc: '政史地背诵/理解困难' },
      { key: 'sw_chinese', name: '语文薄弱', desc: '阅读理解/作文得分低' },
    ]
  },
  {
    key: 'school_type', name: '学校类型', color: '#22c55e',
    tags: [
      { key: 'st_key', name: '重点/示范校', desc: '省/市级重点中学' },
      // Fixed homophone typo: the description previously read "公理" (axiom)
      // where "公立" (public) was intended.
      { key: 'st_normal', name: '普通公办', desc: '常规公立中学' },
      { key: 'st_private', name: '私立/民办', desc: '高收费民办学校' },
      { key: 'st_intl', name: '国际学校', desc: '双语或国际课程学校' },
      { key: 'st_town', name: '乡镇/县域', desc: '非市区下沉学校' },
    ]
  },
  {
    key: 'parent_job', name: '家长职业', color: '#3b82f6',
    tags: [
      { key: 'pj_gov', name: '体制内/国企', desc: '公务员、教师、医生等' },
      { key: 'pj_corp', name: '企业白领/高管', desc: '外企、大厂、管理层' },
      { key: 'pj_biz', name: '个体/私营', desc: '企业主、商户' },
      { key: 'pj_free', name: '自由职业', desc: '弹性工作制' },
      { key: 'pj_fulltime', name: '全职妈妈', desc: '脱产带娃' },
      { key: 'pj_worker', name: '蓝领/基层', desc: '制造业、服务业基层' },
    ]
  },
  {
    key: 'engagement', name: '活跃特征', color: '#06b6d4',
    tags: [
      { key: 'eng_active_daily', name: '日活用户', desc: '每日登录做题/检查' },
      { key: 'eng_weekend', name: '周末活跃', desc: '集中在周末使用' },
      { key: 'eng_exam', name: '考前突击', desc: '期中/期末活跃度飙升' },
      { key: 'eng_dormant', name: '沉默用户', desc: '超过30天未登录' },
      { key: 'eng_paid', name: '付费会员', desc: '购买了长期课程/资料' },
    ]
  },
  {
    key: 'device', name: '设备信息', color: '#64748b',
    tags: [
      { key: 'dv_ios', name: 'iOS 主导', desc: '主要用 iPhone/iPad' },
      { key: 'dv_android', name: 'Android 主导', desc: '主要用安卓设备' },
      { key: 'dv_pad', name: '平板活跃', desc: '大量时间在Pad上学习' },
      { key: 'dv_pc', name: 'PC/网页端', desc: '常用电脑宽屏上课' },
    ]
  },
];
// =============================================
// 数据生成
// =============================================
/**
 * Returns a random integer between `min` and `max`, both ends inclusive.
 * Intended for integer arguments with `min <= max`.
 *
 * @param {number} min - Lower bound (inclusive).
 * @param {number} max - Upper bound (inclusive).
 * @returns {number} A value in [min, max].
 */
function random(min, max) {
  // Number of distinct values in the closed range.
  const span = max - min + 1;
  const offset = Math.floor(Math.random() * span);
  return offset + min;
}
/**
 * Returns one element of `arr`, chosen with equal probability.
 * An empty array yields `undefined`.
 *
 * @param {Array} arr - Candidate values.
 * @returns {*} A randomly selected element.
 */
function pick(arr) {
  const index = Math.floor(Math.random() * arr.length);
  const chosen = arr[index];
  return chosen;
}
/**
 * Picks one `value` from `options`, with probability proportional to each
 * option's `weight`.
 *
 * If no option is selected during the scan (for example because the weights
 * do not sum to a usable number), the last option's value is returned.
 *
 * @param {Array<{value: *, weight: number}>} options - Weighted candidates.
 * @returns {*} The `value` of the selected option.
 */
function weightedPick(options) {
  let weightSum = 0;
  for (const option of options) {
    weightSum += option.weight;
  }

  // Walk the options, consuming the random budget until it is used up.
  let remaining = Math.random() * weightSum;
  for (let i = 0; i < options.length; i += 1) {
    const candidate = options[i];
    remaining -= candidate.weight;
    if (remaining <= 0) {
      return candidate.value;
    }
  }

  const last = options[options.length - 1];
  return last.value;
}
/**
 * Populates the 'onion' database with simulated data.
 *
 * Steps:
 *   1. Insert every category and tag from `TAG_SYSTEM`.
 *   2. Insert `USER_COUNT` users in batches, each with a generated tag set.
 *   3. Compute per-tag coverage statistics and a simulated trend value.
 *
 * The database handle is always closed, even if a step throws.
 * NOTE(review): the `db` API used here (prepare/run/all/exec/transaction)
 * looks like better-sqlite3 — confirm against `./init`.
 *
 * @returns {void}
 */
function seedData() {
  initializeDatabase('onion');
  const db = getDb('onion');

  try {
    console.log('🏗️ 开始生成 洋葱客户大数据 模拟数据...\n');
    const USER_COUNT = 50_000; // Requested data volume: 50k users.

    // === Step 1: create tag categories and tags ===
    console.log('📌 Step 1: 创建标签体系...');
    const insertCat = db.prepare(
      'INSERT INTO tag_categories (key, name, sort_order, color) VALUES (?, ?, ?, ?)'
    );
    const insertTag = db.prepare(
      'INSERT INTO tags (key, name, category_id, description, sort_order) VALUES (?, ?, ?, ?, ?)'
    );
    // Maps tag key -> { id, catKey } so user tags can be linked by key.
    const tagMap = {};
    let totalTags = 0;
    const txTags = db.transaction(() => {
      TAG_SYSTEM.forEach((cat, ci) => {
        const catRes = insertCat.run(cat.key, cat.name, ci, cat.color);
        const catId = catRes.lastInsertRowid;
        cat.tags.forEach((tag, ti) => {
          const tagRes = insertTag.run(tag.key, tag.name, catId, tag.desc, ti);
          tagMap[tag.key] = { id: Number(tagRes.lastInsertRowid), catKey: cat.key };
          totalTags++;
        });
      });
    });
    txTags();
    console.log(`${TAG_SYSTEM.length} 个分类,${totalTags} 个标签\n`);

    // === Step 2: generate users ===
    console.log(`👥 Step 2: 生成 ${USER_COUNT.toLocaleString()} 个家长/学生用户...`);
    const insertUser = db.prepare('INSERT INTO users (uid, name, email) VALUES (?, ?, ?)');
    const insertUserTag = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
    const BATCH = 5000;
    let tagAssignments = 0;
    let created = 0;

    for (let batch = 0; created < USER_COUNT; batch++) {
      // Clamp the final batch so exactly USER_COUNT users are created even
      // when USER_COUNT is not a multiple of BATCH.
      const batchSize = Math.min(BATCH, USER_COUNT - created);
      const firstIdx = created + 1;

      // One transaction per batch keeps the bulk insert fast.
      const tx = db.transaction(() => {
        for (let i = 0; i < batchSize; i++) {
          const idx = firstIdx + i;
          const uid = `u_${String(idx).padStart(7, '0')}`;
          const userRes = insertUser.run(uid, `家长 ${idx}`, `parent${idx}@onion.example.com`);
          const userId = userRes.lastInsertRowid;
          const userTags = generateUserTags();
          for (const tagKey of userTags) {
            // Skip keys that are not part of the seeded taxonomy.
            if (tagMap[tagKey]) {
              insertUserTag.run(userId, tagMap[tagKey].id);
              tagAssignments++;
            }
          }
        }
      });
      tx();
      created += batchSize;

      // Report progress after the first batch and then every second batch.
      if ((batch + 1) % 2 === 0 || batch === 0) {
        process.stdout.write(` 进度: ${created.toLocaleString()}/${USER_COUNT.toLocaleString()}\n`);
      }
    }
    console.log(`\n${USER_COUNT.toLocaleString()} 个用户,${tagAssignments.toLocaleString()} 个标签关联\n`);

    // === Step 3: update tag statistics ===
    console.log('📈 Step 3: 统计标签覆盖...');
    db.exec(`
      UPDATE tags SET
        coverage = (SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id),
        coverage_rate = ROUND((SELECT COUNT(*) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ${USER_COUNT}, 2)
    `);

    // Simulated trend per tag, within ±5 and rounded to two decimals.
    // (Previously the offset was -3, which produced values in [-3, 7).)
    const allTags = db.prepare('SELECT id FROM tags').all();
    const updateTrend = db.prepare('UPDATE tags SET trend = ? WHERE id = ?');
    const txTrend = db.transaction(() => {
      for (const t of allTags) {
        updateTrend.run(Number((Math.random() * 10 - 5).toFixed(2)), t.id);
      }
    });
    txTrend();

    console.log(' ✅ 统计完成\n');
    console.log('🎉 数据生成完毕!可启动 server.js');
  } finally {
    // Release the database handle even when seeding fails part-way.
    db.close();
  }
}
// =============================================
// 标签分配逻辑(相关性构建)
// =============================================
/**
 * Generates the tag keys for one simulated user.
 *
 * Tags are drawn category by category, with later draws conditioned on
 * earlier ones to create realistic correlations (e.g. city tier influences
 * income, income influences study preference and school type).
 *
 * Guarantees:
 *   - exactly one `inc_*` (income) tag per user;
 *   - a `cs_mid` / `cs_high` stage tag is present if and only if a matching
 *     `cg_mid*` / `cg_high*` grade tag is present;
 *   - the returned array contains no duplicates.
 *
 * @returns {string[]} Unique tag keys as defined in `TAG_SYSTEM`.
 */
function generateUserTags() {
  const tags = [];

  // Family role.
  const role = weightedPick([
    { value: 'pi_mom', weight: 60 },
    { value: 'pi_dad', weight: 20 },
    { value: 'pi_both', weight: 12 },
    { value: 'pi_single', weight: 5 },
    { value: 'pi_grand', weight: 3 },
  ]);
  tags.push(role);

  // City tier.
  const city = weightedPick([
    { value: 'ct_tier1', weight: 15 },
    { value: 'ct_new_tier1', weight: 25 },
    { value: 'ct_tier2', weight: 30 },
    { value: 'ct_tier3', weight: 28 },
    { value: 'ct_overseas', weight: 2 },
  ]);
  tags.push(city);

  // Household income, strongly correlated with city tier.
  // A single draw is made; an earlier duplicate draw used a misspelled weight
  // property and tagged every user as 'inc_low' in addition to this value.
  let incomeWeights;
  if (city === 'ct_tier1' || city === 'ct_overseas') {
    incomeWeights = [
      { value: 'inc_high', weight: 30 },
      { value: 'inc_mid_high', weight: 40 },
      { value: 'inc_mid', weight: 20 },
      { value: 'inc_low', weight: 10 },
    ];
  } else if (city === 'ct_tier3') {
    incomeWeights = [
      { value: 'inc_high', weight: 5 },
      { value: 'inc_mid_high', weight: 15 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 40 },
    ];
  } else {
    incomeWeights = [
      { value: 'inc_high', weight: 10 },
      { value: 'inc_mid_high', weight: 30 },
      { value: 'inc_mid', weight: 40 },
      { value: 'inc_low', weight: 20 },
    ];
  }
  const actualIncome = weightedPick(incomeWeights);
  tags.push(actualIncome);

  // Number of children.
  const childCount = weightedPick([
    { value: 'cc_one', weight: 55 },
    { value: 'cc_two', weight: 40 },
    { value: 'cc_multi', weight: 5 },
  ]);
  tags.push(childCount);
  const isMultiChild = childCount === 'cc_two' || childCount === 'cc_multi';
  if (isMultiChild && Math.random() < 0.3) {
    tags.push('cc_cross');
  }

  // School stage and grade of the (first) child.
  const stage = weightedPick([
    { value: 'cs_mid', weight: 60 },
    { value: 'cs_high', weight: 40 },
  ]);
  tags.push(stage);
  if (stage === 'cs_mid') {
    tags.push(weightedPick([
      { value: 'cg_mid1', weight: 35 },
      { value: 'cg_mid2', weight: 35 },
      { value: 'cg_mid3', weight: 30 },
    ]));
  } else {
    tags.push(weightedPick([
      { value: 'cg_high1', weight: 40 },
      { value: 'cg_high2', weight: 35 },
      { value: 'cg_high3', weight: 25 },
    ]));
  }

  // Multi-child families usually get a second grade tag. The stage is derived
  // from that grade so stage and grade tags always agree.
  if (isMultiChild && Math.random() < 0.7) {
    const siblingGrade = pick(['cg_mid1', 'cg_mid2', 'cg_mid3', 'cg_high1', 'cg_high2', 'cg_high3']);
    tags.push(siblingGrade.startsWith('cg_mid') ? 'cs_mid' : 'cs_high');
    tags.push(siblingGrade);
  }
  if (Math.random() < 0.25) tags.push('cs_transition');

  // Study preference: high income / tier-1 families lean towards advanced
  // and study-abroad tracks, everyone else towards fundamentals.
  let prefWeights;
  if (actualIncome === 'inc_high' || city === 'ct_tier1') {
    prefWeights = [
      { value: 'sp_top', weight: 30 },
      { value: 'sp_abroad', weight: 20 },
      { value: 'sp_base', weight: 30 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 5 },
    ];
  } else {
    prefWeights = [
      { value: 'sp_base', weight: 60 },
      { value: 'sp_top', weight: 15 },
      { value: 'sp_self', weight: 15 },
      { value: 'sp_art', weight: 8 },
      { value: 'sp_abroad', weight: 2 },
    ];
  }
  tags.push(weightedPick(prefWeights));
  if (Math.random() < 0.6) tags.push(pick(['sp_base', 'sp_self', 'sp_top']));

  // Weak subjects: up to three draws so tags overlap naturally across users.
  tags.push(weightedPick([
    { value: 'sw_math', weight: 35 },
    { value: 'sw_english', weight: 25 },
    { value: 'sw_science', weight: 20 },
    { value: 'sw_arts', weight: 10 },
    { value: 'sw_chinese', weight: 10 },
  ]));
  if (Math.random() < 0.7) tags.push(pick(['sw_math', 'sw_english', 'sw_science', 'sw_arts']));
  if (Math.random() < 0.3) tags.push(pick(['sw_chinese', 'sw_arts', 'sw_science']));

  // School type: tier-3 cities get township schools; international schools
  // are only available to high-income families elsewhere.
  if (city === 'ct_tier3') {
    tags.push(weightedPick([
      { value: 'st_normal', weight: 50 },
      { value: 'st_town', weight: 40 },
      { value: 'st_key', weight: 10 },
    ]));
  } else {
    const stWeights = [
      { value: 'st_key', weight: 30 },
      { value: 'st_normal', weight: 50 },
      { value: 'st_private', weight: 15 },
    ];
    if (actualIncome === 'inc_high') stWeights.push({ value: 'st_intl', weight: 15 });
    tags.push(weightedPick(stWeights));
  }

  // Parent occupation: the two parents may differ, so a second occupation is
  // sometimes added.
  if (role === 'pi_mom' && Math.random() < 0.2) {
    tags.push('pj_fulltime');
  } else {
    tags.push(weightedPick([
      { value: 'pj_gov', weight: 25 },
      { value: 'pj_corp', weight: 30 },
      { value: 'pj_biz', weight: 15 },
      { value: 'pj_free', weight: 10 },
      { value: 'pj_worker', weight: 20 },
    ]));
  }
  if (Math.random() < 0.4) {
    tags.push(pick(['pj_gov', 'pj_corp', 'pj_biz', 'pj_free']));
  }

  // Engagement, chosen from a weight profile that depends on the persona.
  // The first matching rule wins.
  const engagementProfile = (daily, weekend, exam, dormant) => [
    { value: 'eng_active_daily', weight: daily },
    { value: 'eng_weekend', weight: weekend },
    { value: 'eng_exam', weight: exam },
    { value: 'eng_dormant', weight: dormant },
  ];
  let engWeights;
  if ((actualIncome === 'inc_high' || actualIncome === 'inc_mid_high') && tags.includes('sp_top')) {
    // Higher income + advanced track: very likely daily active.
    engWeights = engagementProfile(40, 30, 20, 10);
  } else if (tags.includes('pj_fulltime')) {
    // Stay-at-home mother: highest daily-active probability.
    engWeights = engagementProfile(50, 25, 15, 10);
  } else if (tags.includes('pj_gov')) {
    // Public sector / state-owned enterprise: moderate daily activity.
    engWeights = engagementProfile(30, 35, 25, 10);
  } else if (actualIncome === 'inc_high') {
    // High-income users are more active overall.
    engWeights = engagementProfile(35, 30, 25, 10);
  } else {
    engWeights = engagementProfile(20, 35, 30, 15);
  }
  tags.push(weightedPick(engWeights));
  if (Math.random() < 0.4) tags.push('eng_exam');

  // Paid membership: high income, advanced track or daily-active users are
  // more likely to pay.
  if (actualIncome === 'inc_high' || tags.includes('sp_top') || tags.includes('eng_active_daily')) {
    if (Math.random() < 0.5) tags.push('eng_paid');
  } else if (Math.random() < 0.15) {
    tags.push('eng_paid');
  }

  // Devices: cross-device usage is common, so extra device tags may be added.
  tags.push(weightedPick([
    { value: 'dv_ios', weight: 40 },
    { value: 'dv_android', weight: 50 },
    { value: 'dv_pc', weight: 10 },
  ]));
  if (Math.random() < 0.5) tags.push('dv_pad');
  if (Math.random() < 0.3) tags.push('dv_pc');
  if (Math.random() < 0.3) tags.push(pick(['dv_ios', 'dv_android']));

  // De-duplicate: several draws above can yield the same key.
  return [...new Set(tags)];
}
// Entry point: seeding runs unconditionally as soon as this file is loaded.
seedData();