Update README and project cleanup

This commit is contained in:
inkling
2026-04-08 14:52:09 +08:00
commit fafd267288
71 changed files with 14865 additions and 0 deletions

View File

@@ -0,0 +1,673 @@
/**
* 新数据导入脚本 v3.0
* 基于"清洗1.0.xlsx"的完整标签体系
*
* 标签体系49个标签分为5个维度
* 用法: node scripts/import-clean-data.js
*/
const ExcelJS = require('exceljs');
const path = require('path');
const { getDb, initializeDatabase } = require('../db/init');
// Path of the cleaned workbook to import, resolved relative to this script's
// directory (the file sits one level above it).
const EXCEL_FILE = path.join(__dirname, '../清洗1.0.xlsx');
// ════════════════════════════════════════════════════════════════════════════
// Tag category definitions v3.0, grouped into five dimensions.
//
// Each entry describes one tag category:
//   key      - category key written to tag_categories.key
//   name     - display name written to tag_categories.name
//   color    - hex colour written to tag_categories.color
//   columns  - 1-based worksheet column(s) the category is derived from (A = 1)
//   type     - how the column is interpreted (descriptive metadata)
//   keywords - for 'keyword_extract' categories, the phrases searched for
//
// NOTE(review): the import code visible in this file reads only key, name and
// color; columns/type/keywords document the extraction implemented in
// addUserTags and should be kept in sync with it.
// ════════════════════════════════════════════════════════════════════════════
const TAG_CATEGORIES = [
  // ── Dimension 1: guardian information (5 categories) ──────────────────────
  {
    key: 'guardian_role',
    name: '监护人身份',
    color: '#3b82f6',
    columns: [1], // A: family role
    type: 'discrete',
  },
  {
    key: 'guardian_education',
    name: '文化程度',
    color: '#6366f1',
    columns: [2], // B: education level
    type: 'discrete',
  },
  {
    key: 'guardian_occupation',
    name: '职业与经济地位',
    color: '#8b5cf6',
    columns: [3], // C: occupation
    type: 'discrete',
  },
  {
    key: 'guardian_age_group',
    name: '监护人年龄段',
    color: '#a78bfa',
    columns: [4], // D: age
    type: 'continuous',
  },
  {
    key: 'second_guardian_role',
    name: '第二监护人身份',
    color: '#c084fc',
    columns: [5], // E: family role of the second guardian
    type: 'discrete',
  },
  // ── Dimension 2: child information (3 categories) ─────────────────────────
  {
    key: 'child_gender',
    name: '孩子性别',
    color: '#ec4899',
    columns: [6], // F: gender
    type: 'discrete',
  },
  {
    key: 'child_grade',
    name: '孩子学段',
    color: '#f472b6',
    columns: [7], // G: school grade
    type: 'discrete',
  },
  {
    key: 'child_academic_score',
    name: '学习成绩',
    color: '#f97316',
    columns: [8], // H: academic performance
    type: 'discrete',
  },
  // ── Dimension 3: family environment (4 categories) ────────────────────────
  {
    key: 'family_structure',
    name: '家庭结构',
    color: '#06b6d4',
    columns: [9], // I: basic family situation
    type: 'keyword_extract',
    // Same phrase list that addUserTags searches for (including '多代').
    keywords: ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代'],
  },
  {
    key: 'parent_child_relationship',
    name: '亲子关系',
    color: '#0891b2',
    columns: [10], // J: parent-child relationship
    type: 'text',
  },
  {
    key: 'child_living_with_parents',
    name: '与父母同住情况',
    color: '#10b981',
    columns: [14], // N: whether the child grew up with the parents
    type: 'yes_no',
  },
  {
    key: 'child_caregivers',
    name: '参与养育人员',
    color: '#059669',
    columns: [15], // O: who else takes part in raising the child
    type: 'text',
  },
  // ── Dimension 4: parenting risk (3 categories) ────────────────────────────
  {
    key: 'education_consensus',
    name: '教育理念一致性',
    color: '#f59e0b',
    columns: [11], // K: whether the parents disagree about education
    type: 'yes_no',
  },
  {
    key: 'child_negation',
    name: '否定孩子情况',
    color: '#d97706',
    columns: [12], // L: whether the child is frequently negated
    type: 'yes_no',
  },
  {
    key: 'physical_punishment',
    name: '打骂教育',
    color: '#dc2626',
    columns: [13], // M: whether scolding/physical punishment is used
    type: 'yes_no',
  },
  // ── Dimension 5: service plan (1 category) ────────────────────────────────
  {
    key: 'service_duration',
    name: '服务周期',
    color: '#7c3aed',
    columns: [17], // Q: number of days
    type: 'discrete',
  },
];
// Value normalisation tables, keyed by category key.
// Each table is written as "canonical tag name -> raw spellings seen in the
// sheet" and inverted once at load time, so the exported shape is the lookup
// the callers use: raw spelling -> canonical tag name.
const TAG_MAPPINGS = (() => {
  const invert = (groups) =>
    Object.fromEntries(
      Object.entries(groups).flatMap(([canonical, spellings]) =>
        spellings.map((spelling) => [spelling, canonical]),
      ),
    );

  return {
    guardian_role: invert({
      '母亲': ['母亲', '妈妈', '母'],
      '父亲': ['父亲', '爸爸'],
      '祖母': ['奶奶', '祖母'],
      '祖父': ['爷爷'],
      '外祖母': ['外婆', '姥姥'],
      '外祖父': ['外公', '姥爷'],
      '其他亲属': ['舅舅', '妻子', '大姐'],
    }),
    guardian_education: invert({
      '小学': ['初小', '小学'],
      '初中': ['初中'],
      '中专': ['中师', '中专'],
      '高中': ['高中'],
      '大专': ['大专'],
      '本科': ['大学', '本科', '大学本科'],
      '硕士及以上': ['硕士', '研究生', '在职研究生'],
    }),
    child_gender: invert({
      '男孩': ['男'],
      '女孩': ['女'],
      '双胞胎': ['女、男'],
    }),
    child_academic_score: invert({
      '优秀': ['优秀'],
      '良好': ['良好'],
      '一般': ['一般'],
      '较差': ['差'],
    }),
    child_living_with_parents: invert({
      '是': ['是', '是的', '在'],
      '否': ['否', '没有', '不是'],
    }),
    education_consensus: invert({
      '有分歧': ['有', '是'],
      '无分歧': ['否', '无', '没有'],
    }),
    child_negation: invert({
      '是': ['是', '有', '是的', '经常'],
      '否': ['否', '无', '没有', '偶尔'],
    }),
    physical_punishment: invert({
      '有': ['有', '是', '有过', '偶尔有'],
      '无': ['无', '没有', '否', '基本上没有'],
    }),
    service_duration: invert({
      '60天课程': ['60天'],
      '90天课程': ['90天'],
      '180天课程': ['180天'],
    }),
  };
})();
/**
 * Map a guardian's age to an age-band label.
 *
 * The leading integer of the value is used, so both numbers (42) and strings
 * with a unit suffix ("42岁") are accepted. Anything that does not start with
 * a positive integer (null, empty/blank text, words, 0, negatives) yields the
 * "unknown age" label instead of silently landing in the last band.
 *
 * @param {number|string|null|undefined} age - Raw cell value.
 * @returns {string} One of the fixed age-band labels, or '年龄未知'.
 */
function getAgeGroup(age) {
  if (age === null || age === undefined) return '年龄未知';
  // Explicit radix; parse the string form so " 40 " and "40岁" both work.
  const ageNum = Number.parseInt(String(age).trim(), 10);
  if (Number.isNaN(ageNum) || ageNum <= 0) return '年龄未知';
  if (ageNum < 25) return '25岁以下';
  if (ageNum < 35) return '25-35岁';
  if (ageNum < 45) return '35-45岁';
  if (ageNum < 55) return '45-55岁';
  if (ageNum < 65) return '55-65岁';
  if (ageNum < 75) return '65-75岁';
  return '75岁以上';
}
/**
 * Map a free-text school grade to a school-stage label.
 *
 * Rules are evaluated top to bottom and the first match wins. Middle- and
 * high-school markers are tested BEFORE the bare numerals: "初一", "初三",
 * "高二" etc. all contain 一/二/三, so testing the primary-school numerals
 * first would classify every secondary grade as lower primary.
 *
 * @param {*} grade - Raw cell value; falsy values yield '学段未知'.
 * @returns {string} Stage label, or '学段未知' when nothing matches.
 */
function gradeToSegment(grade) {
  if (!grade) return '学段未知';
  const gradeStr = String(grade).toLowerCase();

  const rules = [
    // 七年级/八年级 are the numbered names of 初一/初二, alongside the
    // existing 九年 (= 初三) rule.
    [['初一', '初二', '准初', '七年', '八年'], '初中前期(初一初二)'],
    [['初三', '九年'], '初中毕业班(初三)'],
    [['高一', '高二'], '高中前期(高一高二)'],
    [['高三'], '高中毕业班(高三)'],
    [['一', '1年', '二', '2年', '三', '3年'], '小学低段(1-3年级)'],
    [['四', '4年', '五', '5年', '六', '6年'], '小学高段(4-6年级)'],
  ];

  for (const [markers, segment] of rules) {
    if (markers.some((marker) => gradeStr.includes(marker))) return segment;
  }
  return '学段未知';
}
/**
 * Classify a free-text description of the parent-child relationship.
 *
 * The generic "好" test matches inside "不好" and "时好时坏", so those two
 * phrasings are handled first; otherwise the good / average / poor keyword
 * groups are tried in that order and the first hit wins.
 *
 * @param {*} text - Raw cell value; falsy values yield '未指定'.
 * @returns {string} '亲子关系良好' | '亲子关系一般' | '亲子关系较差' |
 *   '亲子关系未评估' (no keyword matched) | '未指定' (no input).
 */
function relationshipQuality(text) {
  if (!text) return '未指定';
  const lowerText = String(text).toLowerCase();
  const mentionsAny = (words) => words.some((word) => lowerText.includes(word));

  // Negated "good" (不好, 不太好, 不是很好, 不怎么好, 不算好) means poor.
  if (/不(?:太|是很|怎么|算)?好/.test(lowerText)) return '亲子关系较差';
  // "Sometimes good, sometimes bad" is average even though it contains 好.
  if (lowerText.includes('时好时坏')) return '亲子关系一般';

  // 良好/还好/较好 all contain 好, and 还可以 contains 可以.
  if (mentionsAny(['好', '和谐', '可以'])) return '亲子关系良好';
  if (mentionsAny(['一般', '还行', '正常'])) return '亲子关系一般';
  if (mentionsAny(['差', '紧张'])) return '亲子关系较差';
  return '亲子关系未评估';
}
/**
 * Import the cleaned workbook into the 'onion' database.
 *
 * Steps, in order:
 *   1. Read the first worksheet of EXCEL_FILE.
 *   2. Make sure every TAG_CATEGORIES entry exists in `tag_categories`.
 *   3. Insert one `users` row per data row (uid `user_<rowNumber - 1>`); rows
 *      with an empty first column are skipped with a warning, and rows whose
 *      insert is ignored (no change reported) get no tags.
 *   4. Attach tags to each newly inserted user via addUserTags().
 *   5. Recompute tag coverage via updateTagStats() and print summary counts.
 *
 * Every SQL statement is prepared once up front instead of on each loop
 * iteration / tag lookup. On failure the error is logged, the database
 * handle (if one was opened) is closed, and the process exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function importCleanData() {
  let db = null;
  try {
    console.log(`\n📂 读取 Excel 文件: ${EXCEL_FILE}`);
    const workbook = new ExcelJS.Workbook();
    await workbook.xlsx.readFile(EXCEL_FILE);
    const worksheet = workbook.getWorksheet(1);
    if (!worksheet) {
      throw new Error('找不到工作表');
    }
    console.log(`📊 总行数: ${worksheet.rowCount}`);

    db = getDb('onion');
    // Initialise the database before any statement is prepared.
    initializeDatabase('onion');

    // ── Categories ────────────────────────────────────────────────────────
    console.log('🏗️ 建立分类体系...');
    const insertCategoryStmt = db.prepare(`
      INSERT OR IGNORE INTO tag_categories (key, name, sort_order, color)
      VALUES (?, ?, ?, ?)
    `);
    const selectCategoryStmt = db.prepare(`
      SELECT id FROM tag_categories WHERE key = ?
    `);
    const categoryMap = {};
    for (const cat of TAG_CATEGORIES) {
      insertCategoryStmt.run(cat.key, cat.name, 0, cat.color || '#6366f1');
      // Re-select so the id is known whether the row was just created or
      // already existed.
      categoryMap[cat.key] = selectCategoryStmt.get(cat.key).id;
    }
    console.log(`✅ 创建了 ${Object.keys(categoryMap).length} 个分类`);

    // ── Prepared statements for the row loop ──────────────────────────────
    const insertUserStmt = db.prepare(`
      INSERT OR IGNORE INTO users (uid, name, extra_json)
      VALUES (?, ?, ?)
    `);
    const insertUserTagStmt = db.prepare(`
      INSERT OR IGNORE INTO user_tags (user_id, tag_id)
      VALUES (?, ?)
    `);
    const selectTagByNameStmt = db.prepare(`
      SELECT id FROM tags WHERE category_id = ? AND name = ?
    `);
    const tagKeyExistsStmt = db.prepare(`SELECT 1 FROM tags WHERE key = ?`);
    const insertTagStmt = db.prepare(`
      INSERT INTO tags (key, name, category_id, sort_order)
      VALUES (?, ?, ?, ?)
    `);
    const selectTagIdByKeyStmt = db.prepare(`
      SELECT id FROM tags WHERE key = ?
    `);

    // Cache of "<categoryKey>:<tagName>" -> tag id for this run.
    const tagCache = new Map();

    // Return the id of the tag `tagName` in category `catKey`, creating the
    // tag when it does not exist yet. Returns null when either argument is
    // empty.
    const getOrCreateTag = (catKey, tagName) => {
      if (!tagName || !catKey) return null;
      const cacheKey = `${catKey}:${tagName}`;
      const cachedId = tagCache.get(cacheKey);
      if (cachedId) return cachedId;

      const categoryId = categoryMap[catKey];
      // Reuse a tag that is already stored for this category.
      let tag = selectTagByNameStmt.get(categoryId, tagName);
      if (!tag) {
        // Derive a key from a 24-bit hash of the normalised name.
        const tagNameNorm = String(tagName).toLowerCase().trim().replace(/\s+/g, '_');
        const hashCode = Array.from(tagNameNorm).reduce((h, c) => ((h << 5) - h) + c.charCodeAt(0), 0) & 0xffffff;
        const hashHex = hashCode.toString(16);
        let tagKey = `${catKey}_${hashHex}`;
        // On a key collision append _1, _2, ... until the key is free.
        for (let counter = 1; tagKeyExistsStmt.get(tagKey); counter++) {
          tagKey = `${catKey}_${hashHex}_${counter}`;
        }
        insertTagStmt.run(tagKey, tagName, categoryId, 0);
        tag = selectTagIdByKeyStmt.get(tagKey);
      }
      tagCache.set(cacheKey, tag?.id);
      return tag?.id;
    };

    // ── Data rows ─────────────────────────────────────────────────────────
    let insertedCount = 0;
    let rowCount = 0;
    worksheet.eachRow((row, rowNumber) => {
      if (rowNumber === 1) return; // header row
      rowCount++;
      const values = row.values || [];

      const uid = `user_${rowNumber - 1}`; // simple positional user id
      const guardianRole = values[1];
      const childGrade = values[7];
      const childDesc = values[16];
      if (!guardianRole) {
        console.warn(`⚠️ 行 ${rowNumber} 缺少监护人身份,跳过`);
        return;
      }

      // Extra per-user data stored as JSON next to the user row.
      const extraData = {
        row: rowNumber,
        guardianRole: guardianRole,
        childGrade: childGrade,
        childDescription: childDesc ? String(childDesc).substring(0, 500) : ''
      };

      const result = insertUserStmt.run(uid, String(guardianRole), JSON.stringify(extraData));
      // changes === 0 means the insert was ignored; such rows are not tagged.
      if (result.changes > 0) {
        insertedCount++;
        const userId = result.lastInsertRowid;
        addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap);
        if (rowCount % 30 === 0) {
          console.log(` 📝 已处理 ${rowCount} 行...`);
        }
      }
    });
    console.log(`\n✅ 用户导入完成:${insertedCount}`);

    // ── Statistics ────────────────────────────────────────────────────────
    console.log('🔄 更新标签统计...');
    updateTagStats(db);
    console.log('\n📊 数据统计:');
    const stats = db.prepare(`
      SELECT
        (SELECT COUNT(*) FROM users) as total_users,
        (SELECT COUNT(*) FROM tags) as total_tags,
        (SELECT COUNT(*) FROM tag_categories) as total_categories
    `).get();
    console.log(` • 总用户: ${stats.total_users}`);
    console.log(` • 总标签: ${stats.total_tags}`);
    console.log(` • 分类数: ${stats.total_categories}`);
    db.close();
    console.log('\n🎉 导入流程完成!\n');
  } catch (error) {
    console.error('❌ 导入失败:', error.message);
    console.error(error.stack);
    // Release the handle before exiting; process.exit() would skip a
    // `finally` block, so the close has to happen here.
    if (db) {
      try {
        db.close();
      } catch (closeError) {
        console.error('⚠️ 关闭数据库失败:', closeError.message);
      }
    }
    process.exit(1);
  }
}
/**
 * Report whether `text` contains `marker` at least once WITHOUT a negation
 * character (没 / 不 / 无 / 未) directly in front of it.
 *
 * Used by the yes/no fallbacks so that "没有分歧" or "不是" are not read as
 * affirmative merely because they contain 有 / 是.
 *
 * @param {string} text
 * @param {string} marker
 * @returns {boolean}
 */
function hasUnnegatedMarker(text, marker) {
  const negators = '没不无未';
  for (let at = text.indexOf(marker); at !== -1; at = text.indexOf(marker, at + 1)) {
    if (at === 0 || !negators.includes(text[at - 1])) return true;
  }
  return false;
}

/**
 * Bucket a (lower-cased) occupation string into a coarse category.
 * Rules are tried in order and the first keyword hit wins; '其他' otherwise.
 *
 * @param {string} job
 * @returns {string}
 */
function classifyOccupation(job) {
  const rules = [
    [['教师', '医生', '工程', '律师'], '专业人士'],
    [['工人', '工厂'], '工人'],
    [['农'], '农民'], // also covers 农民 / 务农
    [['员工', '职员', '公务', '干部'], '公司/政府工作人员'],
    [['退休'], '退休人士'], // also covers 离退休
    [['个体', '自由', '经营'], '个体户/自由职业'],
    [['商业', '销售'], '销售/商业'],
    [['家'], '家务'],
  ];
  const hit = rules.find(([keywords]) => keywords.some((keyword) => job.includes(keyword)));
  return hit ? hit[1] : '其他';
}

/**
 * Reduce a free-text "who else raises the child" answer to one canonical
 * caregiver label. Rules are tried in order; '其他' when nothing matches.
 *
 * @param {string} text
 * @returns {string}
 */
function classifyCaregiver(text) {
  const rules = [
    [['妈妈', '母亲'], '母亲'],
    [['父亲', '爸爸'], '父亲'],
    [['爷爷'], '祖父'],
    [['奶奶'], '祖母'],
    [['外公', '姥爷'], '外祖父'],
    [['外婆', '姥姥'], '外祖母'],
    [['祖'], '祖父母'],
    [['外'], '外祖父母'],
  ];
  const hit = rules.find(([keywords]) => keywords.some((keyword) => text.includes(keyword)));
  return hit ? hit[1] : '其他';
}

/**
 * Derive tags from one worksheet row and link them to a user.
 *
 * For each supported column that has a value, the value is normalised
 * (lookup in TAG_MAPPINGS, bucketing helper, or keyword extraction), the tag
 * id is obtained from `getOrCreateTag(categoryKey, tagName)` and, when an id
 * comes back, `insertUserTagStmt.run(userId, tagId)` is called.
 *
 * @param {*} userId - Id of the user row the tags belong to.
 * @param {Array} values - Row values indexed by 1-based column number.
 * @param {number} rowNumber - Worksheet row number; rows <= 5 print debug
 *   lines for the first three categories.
 * @param {function(string, string): *} getOrCreateTag - Resolves a tag id.
 * @param {{run: function}} insertUserTagStmt - Statement linking user and tag.
 * @param {Object} categoryMap - Unused here; kept for interface compatibility.
 */
function addUserTags(userId, values, rowNumber, getOrCreateTag, insertUserTagStmt, categoryMap) {
  // Resolve the tag and link it to the user; returns the tag id (if any).
  const attach = (catKey, tagName) => {
    const tagId = getOrCreateTag(catKey, tagName);
    if (tagId !== null && tagId !== undefined) insertUserTagStmt.run(userId, tagId);
    return tagId;
  };
  const verbose = rowNumber <= 5;

  // Column A: guardian role
  if (values[1]) {
    const role = String(values[1]).trim();
    const mapped = TAG_MAPPINGS.guardian_role[role] || role;
    const tagId = attach('guardian_role', mapped);
    if (verbose) console.log(` [行${rowNumber}] 监护人身份: "${role}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Column B: education level
  if (values[2]) {
    const edu = String(values[2]).trim();
    const mapped = TAG_MAPPINGS.guardian_education[edu] || edu;
    const tagId = attach('guardian_education', mapped);
    if (verbose) console.log(` [行${rowNumber}] 文化程度: "${edu}" -> "${mapped}" (tagId: ${tagId})`);
  }

  // Column C: occupation, bucketed into coarse categories
  if (values[3]) {
    const job = String(values[3]).trim().toLowerCase();
    const jobCategory = classifyOccupation(job);
    const tagId = attach('guardian_occupation', jobCategory);
    if (verbose) console.log(` [行${rowNumber}] 职业: "${job}" -> "${jobCategory}" (tagId: ${tagId})`);
  }

  // Column D: age band
  if (values[4]) {
    attach('guardian_age_group', getAgeGroup(values[4]));
  }

  // Column E: second guardian role ('无' and '/' mean "none")
  if (values[5]) {
    const role2 = String(values[5]).trim();
    if (role2 && role2 !== '无' && role2 !== '/') {
      attach('second_guardian_role', TAG_MAPPINGS.guardian_role[role2] || role2);
    }
  }

  // Column F: child gender
  if (values[6]) {
    const gender = String(values[6]).trim();
    attach('child_gender', TAG_MAPPINGS.child_gender[gender] || gender);
  }

  // Column G: school stage
  if (values[7]) {
    attach('child_grade', gradeToSegment(values[7]));
  }

  // Column H: academic performance; a cell may list several values separated
  // by 、 or a comma (ASCII or full-width).
  if (values[8]) {
    const scores = String(values[8])
      .trim()
      .split(/[、,,]/)
      .map((part) => part.trim())
      .filter((part) => part && !part.includes('null'));
    for (const score of scores) {
      attach('child_academic_score', TAG_MAPPINGS.child_academic_score[score] || score);
    }
  }

  // Column I: family structure, one tag per keyword found in the text
  if (values[9]) {
    const familyStr = String(values[9]).trim();
    const keywords = ['三代同堂', '核心家庭', '隔代抚养', '离异', '单亲', '三口之家', '四口之家', '多代'];
    const matched = keywords.filter((keyword) => familyStr.includes(keyword));
    for (const keyword of matched) {
      attach('family_structure', keyword);
    }
    // No keyword recognised: fall back to the (truncated) raw text.
    if (matched.length === 0) {
      attach('family_structure', familyStr.substring(0, 50));
    }
  }

  // Column J: parent-child relationship
  if (values[10]) {
    attach('parent_child_relationship', relationshipQuality(values[10]));
  }

  // Column K: do the parents disagree about education?
  if (values[11]) {
    const consensus = String(values[11]).trim();
    const mapped = TAG_MAPPINGS.education_consensus[consensus]
      || (hasUnnegatedMarker(consensus, '有') ? '有分歧' : '无分歧');
    attach('education_consensus', mapped);
  }

  // Column L: is the child frequently negated?
  if (values[12]) {
    const negation = String(values[12]).trim();
    const affirmative = hasUnnegatedMarker(negation, '是') || hasUnnegatedMarker(negation, '有');
    const mapped = TAG_MAPPINGS.child_negation[negation] || (affirmative ? '是' : '否');
    attach('child_negation', mapped);
  }

  // Column M: scolding / physical punishment
  if (values[13]) {
    const punishment = String(values[13]).trim();
    const mapped = TAG_MAPPINGS.physical_punishment[punishment]
      || (hasUnnegatedMarker(punishment, '有') ? '有' : '无');
    attach('physical_punishment', mapped);
  }

  // Column N: did the child grow up with the parents?
  if (values[14]) {
    const living = String(values[14]).trim();
    let mapped = TAG_MAPPINGS.child_living_with_parents[living];
    if (!mapped) {
      // Exact lookup failed: fall back to keyword matching.
      const saysNo = ['否', '不是', '没有', '不在'].some((word) => living.includes(word));
      if (living.includes('是') && !living.includes('不是')) {
        mapped = '是';
      } else if (saysNo) {
        mapped = '否';
      } else {
        mapped = '是'; // default when the answer is not recognisable
      }
    }
    attach('child_living_with_parents', mapped);
  }

  // Column O: other caregivers ('无' / '没有' mean "nobody else")
  if (values[15]) {
    const caregiverStr = String(values[15]).trim();
    if (caregiverStr && caregiverStr !== '无' && caregiverStr !== '没有') {
      attach('child_caregivers', classifyCaregiver(caregiverStr));
    }
  }

  // Column Q: service duration
  if (values[17]) {
    const duration = String(values[17]).trim();
    attach('service_duration', TAG_MAPPINGS.service_duration[duration] || duration);
  }
}
/**
 * Recompute `coverage` and `coverage_rate` for every row in `tags`.
 *
 * coverage      = number of `user_tags` rows referencing the tag
 * coverage_rate = coverage / total users * 100, formatted with two decimals
 *                 (a string such as '12.50'); numeric 0 when there are no
 *                 users at all.
 *
 * The count and update statements are prepared once and reused for every
 * tag rather than being re-prepared inside the loop.
 *
 * @param {object} db - Database handle exposing `prepare(sql)`.
 */
function updateTagStats(db) {
  const tags = db.prepare(`SELECT id FROM tags`).all();
  const totalUsers = db.prepare(`SELECT COUNT(*) as n FROM users`).get().n;
  const countStmt = db.prepare(`SELECT COUNT(*) as n FROM user_tags WHERE tag_id = ?`);
  const updateStmt = db.prepare(`UPDATE tags SET coverage = ?, coverage_rate = ? WHERE id = ?`);

  for (const { id } of tags) {
    const coverage = countStmt.get(id).n || 0;
    const coverageRate = totalUsers > 0 ? (coverage / totalUsers * 100).toFixed(2) : 0;
    updateStmt.run(coverage, coverageRate, id);
  }
}
// Script entry point: the import starts as soon as this file is loaded
// (e.g. via `node scripts/import-clean-data.js`). importCleanData() handles
// its own errors, so the returned promise is intentionally not awaited.
importCleanData();