Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

View File

@@ -0,0 +1,448 @@
/**
* 清洗3.0 导入脚本 v1.0
*
* 业务约束:
* 1) 参加指导最想解决 缺失时采用保守推断,标签后缀“(推断)”
* 2) 监护人2相关字段不参与建模
* 3) 删除付费能力标签分类
* 4) 全量替换导入
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
const EXCEL_FILE = path.join(__dirname, '../清洗3.0.xlsx');
const DB_THEME = 'onion';
const TOTAL_USERS_FALLBACK = 11500;
const TAG_CATEGORIES = [
{ key: 'basic_info_role', name: '家庭角色', color: '#d97706' },
{ key: 'user_age_group', name: '用户年龄段标签', color: '#6366f1' },
{ key: 'child_grade', name: '孩子学段标签', color: '#8b5cf6' },
{ key: 'family_structure', name: '家庭结构标签', color: '#a78bfa' },
{ key: 'education_risk', name: '教育风险标签', color: '#c084fc' },
{ key: 'family_support', name: '家庭支持度标签', color: '#ec4899' },
{ key: 'urgency', name: '需求紧迫度标签', color: '#f97316' },
{ key: 'core_problem', name: '核心问题标签', color: '#06b6d4' },
{ key: 'intervention_difficulty', name: '干预难度标签', color: '#0891b2' },
{ key: 'conversion_priority', name: '转化优先级标签', color: '#10b981' },
{ key: 'channel_adaption', name: '渠道适配标签', color: '#059669' },
{ key: 'product_match', name: '产品匹配标签', color: '#f59e0b' },
{ key: 'basic_info_education', name: '文化程度', color: '#dc2626' },
{ key: 'service_duration', name: '服务周期标签', color: '#7c3aed' }
];
function text(v) {
if (v === undefined || v === null) return '';
return String(v).replace(/\s+/g, ' ').trim();
}
function parseNumber(v) {
if (v === undefined || v === null || v === '') return null;
const raw = String(v).replace(/[^\d.\-]/g, '');
if (!raw) return null;
const n = Number(raw);
return Number.isFinite(n) ? n : null;
}
function splitMulti(v) {
const s = text(v);
if (!s) return [];
return s
.split(/[、,;/|]+/)
.map((item) => item.trim())
.filter(Boolean);
}
function normalizeFamilyAtmosphere(v) {
const s = text(v);
if (!s) return '中立';
const warm = ['和谐', '温暖', '支持', '理解', '亲密', '关心', '融洽', '良好'];
const cold = ['冷漠', '疏离', '冷战', '忽视', '回避', '压抑', '隔阂'];
const conflict = ['争吵', '冲突', '矛盾', '紧张', '对立', '不和'];
const neutral = ['一般', '普通', '还行', '尚可', '平常'];
const hit = (dict) => dict.some((k) => s.includes(k));
if (hit(cold) || hit(conflict)) return '冷漠';
if (hit(warm)) return '温暖';
if (hit(neutral)) return '中立';
return '中立';
}
function normalizeParentChild(v) {
const s = text(v);
if (!s) return '中立';
if (/(紧张|疏离|冲突|差|糟)/.test(s)) return '紧张';
if (/(良好|亲密|和谐|较好|很好)/.test(s)) return '良好';
return '中立';
}
function normalizeRole(v) {
const s = text(v);
if (!s) return '';
if (/(妈妈|母亲|妈咪)/.test(s)) return '妈妈';
if (/(爸爸|父亲)/.test(s)) return '父亲';
if (/(奶奶|祖母)/.test(s)) return '奶奶';
if (/(爷爷|祖父)/.test(s)) return '爷爷';
if (/(姥姥|外婆)/.test(s)) return '姥姥/外婆';
return s;
}
function ageToTag(age) {
if (age == null) return '';
if (age < 25) return '25岁以下';
if (age < 35) return '25-34岁';
if (age < 45) return '35-44岁';
if (age < 55) return '45-54岁';
return '55岁及以上';
}
function normalizeGrade(v) {
const s = text(v);
if (!s) return '';
if (/幼/.test(s)) return '幼儿园';
if (/(小|一年级|二年级|三年级|四年级|五年级|六年级)/.test(s)) return '小学';
if (/(初一|初二|初三|初中)/.test(s)) return '初中';
if (/(高一|高二|高三|高中)/.test(s)) return '高中';
if (/(大学|大一|大二|大三|大四)/.test(s)) return '大学';
return s;
}
function normalizeScore(v) {
const s = text(v);
if (!s) return '一般';
if (/(优秀|优异|很好|拔尖)/.test(s)) return '优秀';
if (/(良好|较好|不错)/.test(s)) return '良好';
if (/(差|不理想|偏下|落后|薄弱)/.test(s)) return '较差';
return '一般';
}
function inferCoreProblem(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const atmosphere = normalizeFamilyAtmosphere(row['家庭氛围']);
const relation = normalizeParentChild(row['亲子关系']);
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
const majorEvent = text(row['重大影响事件_扩展']);
if (score === '较差') return '学习动力与执行(推断)';
if (/(有|是|存在|经常)/.test(negate) || /(有|是|存在|经常)/.test(physical)) {
return '教养方式调整(推断)';
}
if (atmosphere === '冷漠' || relation === '紧张' || /(有|是|分歧)/.test(divergence)) {
return '亲子沟通修复(推断)';
}
if (/(离异|变故|创伤|重大)/.test(majorEvent)) {
return '情绪与安全感支持(推断)';
}
return '阶段性成长支持(推断)';
}
function inferEducationRisk(row) {
const risk = [];
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
const withParents = text(row['孩子是否在父母身边长大']);
if (/(有|是|分歧|不一致)/.test(divergence)) risk.push('教育理念分歧');
if (/(有|是|经常|总是)/.test(negate)) risk.push('否定式沟通风险');
if (/(有|是|打|骂|体罚)/.test(physical)) risk.push('惩罚式教育风险');
if (/(否|不在|老人|寄养|留守)/.test(withParents)) risk.push('陪伴不足风险');
return risk;
}
function inferFamilyStructure(row) {
const tags = [];
const basic = text(row['家庭基本情况_规范'] || row['家庭基本情况']);
const withParents = text(row['孩子是否在父母身边长大']);
const caregivers = text(row['还有谁参与孩子的养育']);
if (/单亲|离异/.test(basic)) tags.push('单亲家庭');
if (/重组/.test(basic)) tags.push('重组家庭');
if (/三代同堂|隔代|祖/.test(basic) || /爷爷|奶奶|姥姥|外婆|祖/.test(caregivers)) tags.push('隔代参与家庭');
if (/(否|不在|寄养|留守)/.test(withParents)) tags.push('分离养育家庭');
if (!tags.length) tags.push('常规家庭结构');
return tags;
}
function inferUrgency(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const relation = normalizeParentChild(row['亲子关系']);
const physical = text(row['有无打骂教育']);
if (score === '较差' || relation === '紧张' || /(有|是|打|骂)/.test(physical)) return '高紧迫度';
if (score === '一般') return '中紧迫度';
return '低紧迫度';
}
function inferInterventionDifficulty(row) {
let score = 0;
const relation = normalizeParentChild(row['亲子关系']);
const divergence = text(row['家长有无教育分歧']);
const negate = text(row['是否经常否定孩子']);
const physical = text(row['有无打骂教育']);
if (relation === '紧张') score += 2;
if (/(有|是|分歧)/.test(divergence)) score += 1;
if (/(有|是|经常)/.test(negate)) score += 1;
if (/(有|是|打|骂)/.test(physical)) score += 2;
if (score >= 4) return '高干预难度';
if (score >= 2) return '中干预难度';
return '低干预难度';
}
function inferConversionPriority(row) {
const urgency = inferUrgency(row);
const diff = inferInterventionDifficulty(row);
if (urgency === '高紧迫度' && diff !== '高干预难度') return '高优先级';
if (urgency === '高紧迫度' || diff === '中干预难度') return '中优先级';
return '低优先级';
}
function inferChannelAdaption(row) {
const q = text(row['问卷评估']);
if (!q) return '标准沟通';
if (/(线上|微信|视频)/.test(q)) return '线上沟通优先';
if (/(线下|到访|面谈)/.test(q)) return '线下面谈优先';
return '标准沟通';
}
function inferProductMatch(row) {
const score = normalizeScore(row['学习成绩_规范'] || row['学习成绩']);
const relation = normalizeParentChild(row['亲子关系']);
if (score === '较差' && relation === '紧张') return '综合干预方案';
if (score === '较差') return '学习提升方案';
if (relation === '紧张') return '亲子沟通方案';
return '成长支持方案';
}
function inferServiceDuration(row) {
const urgency = inferUrgency(row);
const difficulty = inferInterventionDifficulty(row);
if (urgency === '高紧迫度' || difficulty === '高干预难度') return '12周';
if (urgency === '中紧迫度') return '8周';
return '4周';
}
function updateTagStats(db) {
const totalUsers = db.prepare('SELECT COUNT(*) as n FROM users').get().n || TOTAL_USERS_FALLBACK;
const allTags = db.prepare('SELECT id FROM tags').all();
const stmt = db.prepare(`
UPDATE tags SET
coverage = (SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id),
coverage_rate = ROUND((SELECT COUNT(DISTINCT user_id) FROM user_tags WHERE tag_id = tags.id) * 100.0 / ?, 2)
WHERE id = ?
`);
allTags.forEach((t) => stmt.run(totalUsers, t.id));
}
async function main() {
console.log('\n🚀 清洗3.0 导入流程 v1.0\n');
initializeDatabase(DB_THEME);
const db = getDb(DB_THEME);
try {
db.pragma('foreign_keys = OFF');
console.log('🗑️ 清空旧数据...');
db.prepare('DELETE FROM user_tags').run();
db.prepare('DELETE FROM users').run();
db.prepare('DELETE FROM tags').run();
db.prepare('DELETE FROM tag_categories').run();
const categoryMap = {};
const insertCategoryStmt = db.prepare(`
INSERT INTO tag_categories (key, name, color, sort_order)
VALUES (?, ?, ?, ?)
`);
TAG_CATEGORIES.forEach((cat, idx) => {
const result = insertCategoryStmt.run(cat.key, cat.name, cat.color, idx);
categoryMap[cat.key] = result.lastInsertRowid;
});
console.log(`✅ 已创建 ${TAG_CATEGORIES.length} 个分类(已删除付费能力)`);
const workbook = new ExcelJS.Workbook();
await workbook.xlsx.readFile(EXCEL_FILE);
const worksheet = workbook.worksheets[0];
console.log(`📖 读取 ${worksheet.name} | 行: ${worksheet.rowCount} | 列: ${worksheet.columnCount}`);
const headerRow = worksheet.getRow(1);
const headers = {};
headerRow.eachCell((cell, colNumber) => {
headers[text(cell.value)] = colNumber;
});
const needHeaders = [
'家庭角色', '文化程度', '年龄_数值', '年龄_2_数值', '年级_规范', '学习成绩_规范',
'家庭基本情况_规范', '家庭氛围', '亲子关系', '家长有无教育分歧', '是否经常否定孩子',
'有无打骂教育', '孩子是否在父母身边长大', '还有谁参与孩子的养育',
'重大影响事件_扩展', '参加指导最想解决_扩展', '问卷评估', '文件名称'
];
const missing = needHeaders.filter((h) => !headers[h]);
if (missing.length) {
throw new Error(`缺少关键表头: ${missing.join(', ')}`);
}
const insertUserStmt = db.prepare('INSERT INTO users (uid, name, extra_json) VALUES (?, ?, ?)');
const insertTagStmt = db.prepare('INSERT INTO tags (key, name, category_id, coverage, coverage_rate, sort_order) VALUES (?, ?, ?, 0, 0, 0)');
const insertUserTagStmt = db.prepare('INSERT OR IGNORE INTO user_tags (user_id, tag_id) VALUES (?, ?)');
const tagCache = new Map();
function getOrCreateTag(catKey, tagName) {
const n = text(tagName);
if (!n) return null;
const cacheKey = `${catKey}:${n}`;
if (tagCache.has(cacheKey)) return tagCache.get(cacheKey);
let tag = db.prepare('SELECT id FROM tags WHERE category_id = ? AND name = ?').get(categoryMap[catKey], n);
if (!tag) {
const key = `${catKey}_${Math.random().toString(36).slice(2, 10)}`;
const result = insertTagStmt.run(key, n, categoryMap[catKey]);
tag = { id: result.lastInsertRowid };
}
tagCache.set(cacheKey, tag.id);
return tag.id;
}
let rowCount = 0;
let inserted = 0;
let inferredCoreCount = 0;
worksheet.eachRow((row, rowNumber) => {
if (rowNumber === 1) return;
rowCount += 1;
const rowObj = {};
for (const [name, idx] of Object.entries(headers)) {
rowObj[name] = row.getCell(idx).value;
}
const role = normalizeRole(rowObj['家庭角色']);
if (!role) return;
const fileName = text(rowObj['文件名称']);
const safeFileName = fileName.replace(/\s+/g, '_').slice(0, 60);
const uid = fileName ? `u_${safeFileName}_${rowNumber}` : `u_row_${rowNumber}`;
const userExtra = {
rowNumber,
inferredCore: false,
source: 'clean3.0'
};
const result = insertUserStmt.run(uid, uid, JSON.stringify(userExtra));
if (!result.changes) return;
inserted += 1;
const userId = result.lastInsertRowid;
const addTag = (catKey, tagName) => {
const tagId = getOrCreateTag(catKey, tagName);
if (tagId) insertUserTagStmt.run(userId, tagId);
};
// 基础标签
addTag('basic_info_role', role);
addTag('basic_info_education', text(rowObj['文化程度']));
// 年龄段监护人1 + 监护人2数值年龄合并但不使用监护人2其他字段
const age1 = parseNumber(rowObj['年龄_数值']);
const age2 = parseNumber(rowObj['年龄_2_数值']);
addTag('user_age_group', ageToTag(age1));
addTag('user_age_group', ageToTag(age2));
// 学段
addTag('child_grade', normalizeGrade(rowObj['年级_规范']));
// 家庭结构
inferFamilyStructure(rowObj).forEach((t) => addTag('family_structure', t));
// 教育风险
inferEducationRisk(rowObj).forEach((t) => addTag('education_risk', t));
// 家庭支持度3类氛围 + 亲子关系)
addTag('family_support', `家庭氛围-${normalizeFamilyAtmosphere(rowObj['家庭氛围'])}`);
addTag('family_support', `亲子关系-${normalizeParentChild(rowObj['亲子关系'])}`);
// 紧迫度、难度、优先级
addTag('urgency', inferUrgency(rowObj));
addTag('intervention_difficulty', inferInterventionDifficulty(rowObj));
addTag('conversion_priority', inferConversionPriority(rowObj));
// 渠道/产品/周期
addTag('channel_adaption', inferChannelAdaption(rowObj));
addTag('product_match', inferProductMatch(rowObj));
addTag('service_duration', inferServiceDuration(rowObj));
// 核心问题:优先原始扩展,否则保守推断 + (推断)
const originCore = splitMulti(rowObj['参加指导最想解决_扩展']);
if (originCore.length) {
originCore.forEach((tag) => addTag('core_problem', tag));
} else {
const inferred = inferCoreProblem(rowObj);
addTag('core_problem', inferred);
inferredCoreCount += 1;
}
if (rowCount % 500 === 0) {
console.log(` ✓ 已处理 ${rowCount}`);
}
});
console.log(`\n✅ 导入用户: ${inserted}`);
console.log(`✅ 核心问题推断数: ${inferredCoreCount}`);
updateTagStats(db);
const stats = db.prepare(`
SELECT
(SELECT COUNT(*) FROM users) as total_users,
(SELECT COUNT(*) FROM tags) as total_tags,
(SELECT COUNT(*) FROM tag_categories) as total_categories,
(SELECT COUNT(*) FROM user_tags) as total_rels
`).get();
console.log('\n📊 结果统计');
console.log(` • 用户数: ${stats.total_users}`);
console.log(` • 标签数: ${stats.total_tags}`);
console.log(` • 分类数: ${stats.total_categories}`);
console.log(` • 关系数: ${stats.total_rels}`);
const deletedPayment = db.prepare('SELECT COUNT(*) as n FROM tag_categories WHERE key = ?').get('payment_ability').n;
console.log(` • 付费能力分类存在数: ${deletedPayment}`);
const inferredTags = db.prepare(`
SELECT COUNT(*) as n FROM tags t
JOIN tag_categories c ON c.id = t.category_id
WHERE c.key = 'core_problem' AND t.name LIKE '%(推断)'
`).get().n;
console.log(` • 推断核心问题标签种类: ${inferredTags}`);
db.pragma('foreign_keys = ON');
db.close();
console.log('\n🎉 清洗3.0导入完成\n');
} catch (error) {
console.error('❌ 导入失败:', error.message);
console.error(error.stack);
try { db.close(); } catch (_) {}
process.exit(1);
}
}
main();