From 37da5da0446821784654533124162ae652dcd9e5 Mon Sep 17 00:00:00 2001 From: JiaoTianBo Date: Mon, 30 Mar 2026 17:43:29 +0800 Subject: [PATCH] =?UTF-8?q?refactor(ai):=20=E5=90=88=E5=B9=B6ai=5Fdocument?= =?UTF-8?q?=E8=A1=A8=E5=88=B0vector=5Fstore=E8=A1=A8=EF=BC=8C=E5=88=87?= =?UTF-8?q?=E6=8D=A2=E6=96=87=E6=A1=A3ID=E7=B1=BB=E5=9E=8B=E4=B8=BA?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E4=B8=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 删除旧的ai_document表及相关索引,新增vector_store表兼容Spring AI PgVectorStore结构 - 调整实体类AiDocument映射到vector_store表,使用字符串ID代替Long类型 - 修改Mapper接口及XML中所有ID相关SQL使用字符串类型,并替换表名为vector_store - 修改服务接口与实现类,文档ID参数类型统一为字符串 - 处理文档分块时改用UUID生成chunk ID,确保唯一且格式正确 - 禁用Spring Ai PgVectorStore的自动schema初始化,使用手动创建的表结构 - 更新配置文件OpenAI模型API key及基础URL配置,支持多模型与聊天功能 - 优化日志输出,增加分块文档ID和父文档ID显示,方便调试追踪 --- docs/dev-ops/pgsql/sql/weform_run.sql | 228 +++++++++--------- .../common/config/SpringAiConfig.java | 2 +- .../yinlihupo/domain/entity/AiDocument.java | 18 +- .../cn/yinlihupo/domain/vo/KbDocumentVO.java | 4 +- .../yinlihupo/domain/vo/ReferencedDocVO.java | 4 +- .../cn/yinlihupo/mapper/AiDocumentMapper.java | 22 +- .../service/ai/AiKnowledgeBaseService.java | 4 +- .../ai/impl/AiKnowledgeBaseServiceImpl.java | 10 +- .../service/ai/rag/DocumentProcessor.java | 23 +- src/main/resources/application-dev.yaml | 13 +- .../resources/mapper/AiDocumentMapper.xml | 62 +++-- 11 files changed, 202 insertions(+), 188 deletions(-) diff --git a/docs/dev-ops/pgsql/sql/weform_run.sql b/docs/dev-ops/pgsql/sql/weform_run.sql index 7ca9fc4..d7da1b9 100644 --- a/docs/dev-ops/pgsql/sql/weform_run.sql +++ b/docs/dev-ops/pgsql/sql/weform_run.sql @@ -8,6 +8,122 @@ CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +-- ===================================================== +-- Spring AI PgVectorStore 向量存储表 +-- 用于RAG文档向量存储和相似度搜索 +-- 兼容PgVectorStore默认结构,同时支持完整文档管理 +-- ===================================================== +DROP TABLE IF EXISTS vector_store CASCADE; +CREATE TABLE vector_store ( + -- PgVectorStore 核心字段 + id VARCHAR(255) PRIMARY KEY, + content TEXT, + metadata JSONB, + embedding vector(1536), + + -- 关联关系 + project_id BIGINT, + timeline_node_id BIGINT, + kb_id BIGINT, + + -- 文档来源 + source_type VARCHAR(50), + source_id BIGINT, + + -- 文档扩展信息 + title VARCHAR(500), + content_raw TEXT, + summary TEXT, + + -- 文档元数据 + doc_type VARCHAR(50), + language VARCHAR(10) DEFAULT 'zh', + file_type VARCHAR(50), + file_size BIGINT, + file_path VARCHAR(500), + + -- 时间信息 (用于时间维度检索) + doc_date DATE, + doc_datetime TIMESTAMP, + + -- 分块信息(大文档分块存储) + chunk_index INT DEFAULT 0, + chunk_total INT DEFAULT 1, + chunk_parent_id VARCHAR(255), + + -- 标签和分类 + tags JSONB, + category VARCHAR(100), + + -- 使用统计 + view_count INT DEFAULT 0, + query_count INT DEFAULT 0, + last_queried_at TIMESTAMP, + + -- 状态 + status VARCHAR(20) DEFAULT 'active', + error_message TEXT, + + -- 创建信息 + create_by BIGINT, + create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + update_by BIGINT, + update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + deleted SMALLINT DEFAULT 0, + + -- 外键约束 + CONSTRAINT fk_vs_project FOREIGN KEY (project_id) REFERENCES project(id) ON DELETE SET NULL, + CONSTRAINT fk_vs_timeline FOREIGN KEY (timeline_node_id) REFERENCES project_timeline(id) ON DELETE SET NULL, + CONSTRAINT fk_vs_kb FOREIGN KEY (kb_id) REFERENCES ai_knowledge_base(id) ON DELETE SET NULL, + CONSTRAINT fk_vs_create_by FOREIGN KEY (create_by) REFERENCES sys_user(id) ON DELETE SET NULL +); + +-- 创建向量索引(使用IVFFlat算法,适合中等数据量) +CREATE INDEX idx_vector_store_embedding ON vector_store + USING ivfflat (embedding vector_cosine_ops) + WITH (lists = 100); + +-- 创建其他常用索引 +CREATE INDEX idx_vs_project ON vector_store(project_id) WHERE deleted = 0; +CREATE INDEX idx_vs_timeline ON vector_store(timeline_node_id) WHERE deleted = 0; +CREATE INDEX idx_vs_kb ON vector_store(kb_id) WHERE deleted = 0; +CREATE INDEX idx_vs_source ON vector_store(source_type, source_id) WHERE deleted = 0; +CREATE INDEX idx_vs_status ON vector_store(status); +CREATE INDEX idx_vs_type ON vector_store(doc_type); +CREATE INDEX idx_vs_tags ON vector_store USING GIN(tags); +CREATE INDEX idx_vs_chunk_parent ON vector_store(chunk_parent_id) WHERE chunk_parent_id IS NOT NULL; + +COMMENT ON TABLE vector_store IS '向量存储表 - 用于RAG文档检索和知识库管理'; +COMMENT ON COLUMN vector_store.id IS '文档ID(字符串类型,兼容PgVectorStore)'; +COMMENT ON COLUMN vector_store.content IS '文档内容文本'; +COMMENT ON COLUMN vector_store.metadata IS '文档元数据(JSONB格式,兼容PgVectorStore)'; +COMMENT ON COLUMN vector_store.embedding IS '向量嵌入(1536维)'; +COMMENT ON COLUMN vector_store.project_id IS '关联项目ID'; +COMMENT ON COLUMN vector_store.timeline_node_id IS '关联时间节点ID'; +COMMENT ON COLUMN vector_store.kb_id IS '关联知识库ID'; +COMMENT ON COLUMN vector_store.source_type IS '来源类型: project-项目文档, risk-风险文档, ticket-工单, report-日报, upload-上传文件, knowledge-知识库, chat-对话记录'; +COMMENT ON COLUMN vector_store.source_id IS '来源记录ID'; +COMMENT ON COLUMN vector_store.title IS '文档标题'; +COMMENT ON COLUMN vector_store.content_raw IS '原始内容(带格式)'; +COMMENT ON COLUMN vector_store.summary IS 'AI生成的摘要'; +COMMENT ON COLUMN vector_store.doc_type IS '文档类型: requirement-需求, design-设计, plan-计划, report-报告, contract-合同, photo-照片, other-其他'; +COMMENT ON COLUMN vector_store.language IS '语言: zh-中文, en-英文'; +COMMENT ON COLUMN vector_store.file_type IS '文件类型: pdf, doc, txt, md, jpg, png等'; +COMMENT ON COLUMN vector_store.file_size IS '文件大小(字节)'; +COMMENT ON COLUMN vector_store.file_path IS '文件存储路径'; +COMMENT ON COLUMN vector_store.doc_date IS '文档日期(如日报日期、照片拍摄日期)'; +COMMENT ON COLUMN vector_store.doc_datetime IS '文档时间戳'; +COMMENT ON COLUMN vector_store.chunk_index IS '分块序号'; +COMMENT ON COLUMN vector_store.chunk_total IS '总分块数'; +COMMENT ON COLUMN vector_store.chunk_parent_id IS '父文档ID(分块时使用)'; +COMMENT ON COLUMN vector_store.tags IS '标签数组'; +COMMENT ON COLUMN vector_store.category IS '分类'; +COMMENT ON COLUMN vector_store.view_count IS '查看次数'; +COMMENT ON COLUMN vector_store.query_count IS '被检索次数'; +COMMENT ON COLUMN vector_store.last_queried_at IS '最后被检索时间'; +COMMENT ON COLUMN vector_store.status IS '状态: active-可用, processing-处理中, error-错误, archived-归档'; +COMMENT ON COLUMN vector_store.error_message IS '错误信息'; + -- 设置时区 SET timezone = 'Asia/Shanghai'; @@ -969,118 +1085,8 @@ COMMENT ON COLUMN ai_knowledge_base.status IS '状态: active-可用, archived- -- 11. AI服务相关表 -- ===================================================== --- AI文档向量表 (用于RAG知识库) -DROP TABLE IF EXISTS ai_document CASCADE; -CREATE TABLE ai_document ( - id BIGSERIAL PRIMARY KEY, - doc_id UUID DEFAULT uuid_generate_v4(), - - -- 关联关系 - project_id BIGINT, - timeline_node_id BIGINT, - kb_id BIGINT, - -- 文档来源 - source_type VARCHAR(50) NOT NULL, - source_id BIGINT, - - -- 文档内容 - title VARCHAR(500), - content TEXT NOT NULL, - content_raw TEXT, - summary TEXT, - - -- 向量嵌入 (1536维适配OpenAI, 可调整为其他维度) - embedding vector(1536), - - -- 文档元数据 - doc_type VARCHAR(50), - language VARCHAR(10) DEFAULT 'zh', - file_type VARCHAR(50), - file_size BIGINT, - file_path VARCHAR(500), - - -- 时间信息 (用于时间维度检索) - doc_date DATE, - doc_datetime TIMESTAMP, - - -- 分块信息(大文档分块存储) - chunk_index INT DEFAULT 0, - chunk_total INT DEFAULT 1, - chunk_parent_id BIGINT, - - -- 标签和分类 - tags JSONB, - category VARCHAR(100), - - -- 使用统计 - view_count INT DEFAULT 0, - query_count INT DEFAULT 0, - last_queried_at TIMESTAMP, - - -- 状态 - status VARCHAR(20) DEFAULT 'active', - error_message TEXT, - - -- 创建信息 - create_by BIGINT, - create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - update_by BIGINT, - update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - deleted SMALLINT DEFAULT 0, - - -- 外键约束 - CONSTRAINT fk_ai_doc_project FOREIGN KEY (project_id) REFERENCES project(id) ON DELETE SET NULL, - CONSTRAINT fk_ai_doc_timeline FOREIGN KEY (timeline_node_id) REFERENCES project_timeline(id) ON DELETE SET NULL, - CONSTRAINT fk_ai_doc_kb FOREIGN KEY (kb_id) REFERENCES ai_knowledge_base(id) ON DELETE SET NULL -); - --- 创建向量索引 (使用IVFFlat或HNSW) --- IVFFlat: 适合中等数据量, 内存占用小 --- HNSW: 适合大数据量, 查询更快但内存占用大 -CREATE INDEX idx_ai_document_embedding ON ai_document - USING ivfflat (embedding vector_cosine_ops) - WITH (lists = 100); - --- 创建其他常用索引 -CREATE INDEX idx_ai_doc_project ON ai_document(project_id) WHERE deleted = 0; -CREATE INDEX idx_ai_doc_timeline ON ai_document(timeline_node_id) WHERE deleted = 0; -CREATE INDEX idx_ai_doc_kb ON ai_document(kb_id) WHERE deleted = 0; -CREATE INDEX idx_ai_doc_source ON ai_document(source_type, source_id) WHERE deleted = 0; -CREATE INDEX idx_ai_doc_status ON ai_document(status); -CREATE INDEX idx_ai_doc_type ON ai_document(doc_type); -CREATE INDEX idx_ai_doc_tags ON ai_document USING GIN(tags); - -COMMENT ON TABLE ai_document IS 'AI文档向量表 - 存储所有用于RAG的文档向量'; -COMMENT ON COLUMN ai_document.doc_id IS '文档唯一标识'; -COMMENT ON COLUMN ai_document.project_id IS '关联项目ID'; -COMMENT ON COLUMN ai_document.timeline_node_id IS '关联时间节点ID'; -COMMENT ON COLUMN ai_document.kb_id IS '关联知识库ID'; -COMMENT ON COLUMN ai_document.source_type IS '来源类型: project-项目文档, risk-风险文档, ticket-工单, report-日报, upload-上传文件, knowledge-知识库, chat-对话记录'; -COMMENT ON COLUMN ai_document.source_id IS '来源记录ID'; -COMMENT ON COLUMN ai_document.title IS '文档标题'; -COMMENT ON COLUMN ai_document.content IS '文档内容(纯文本)'; -COMMENT ON COLUMN ai_document.content_raw IS '原始内容(带格式)'; -COMMENT ON COLUMN ai_document.summary IS 'AI生成的摘要'; -COMMENT ON COLUMN ai_document.embedding IS '向量嵌入'; -COMMENT ON COLUMN ai_document.doc_type IS '文档类型: requirement-需求, design-设计, plan-计划, report-报告, contract-合同, photo-照片, other-其他'; -COMMENT ON COLUMN ai_document.language IS '语言: zh-中文, en-英文'; -COMMENT ON COLUMN ai_document.file_type IS '文件类型: pdf, doc, txt, md, jpg, png等'; -COMMENT ON COLUMN ai_document.file_size IS '文件大小(字节)'; -COMMENT ON COLUMN ai_document.file_path IS '文件存储路径'; -COMMENT ON COLUMN ai_document.doc_date IS '文档日期(如日报日期、照片拍摄日期)'; -COMMENT ON COLUMN ai_document.doc_datetime IS '文档时间戳'; -COMMENT ON COLUMN ai_document.chunk_index IS '分块序号'; -COMMENT ON COLUMN ai_document.chunk_total IS '总分块数'; -COMMENT ON COLUMN ai_document.chunk_parent_id IS '父文档ID(分块时使用)'; -COMMENT ON COLUMN ai_document.tags IS '标签数组'; -COMMENT ON COLUMN ai_document.category IS '分类'; -COMMENT ON COLUMN ai_document.view_count IS '查看次数'; -COMMENT ON COLUMN ai_document.query_count IS '被检索次数'; -COMMENT ON COLUMN ai_document.last_queried_at IS '最后被检索时间'; -COMMENT ON COLUMN ai_document.status IS '状态: active-可用, processing-处理中, error-错误, archived-归档'; -COMMENT ON COLUMN ai_document.error_message IS '错误信息'; - -- AI对话记录表 (合并会话管理功能,无需单独的session表) +-- 注:ai_document表已合并到vector_store表中 DROP TABLE IF EXISTS ai_chat_history CASCADE; CREATE TABLE ai_chat_history ( id BIGSERIAL PRIMARY KEY, diff --git a/src/main/java/cn/yinlihupo/common/config/SpringAiConfig.java b/src/main/java/cn/yinlihupo/common/config/SpringAiConfig.java index 61db172..68ff1b3 100644 --- a/src/main/java/cn/yinlihupo/common/config/SpringAiConfig.java +++ b/src/main/java/cn/yinlihupo/common/config/SpringAiConfig.java @@ -41,7 +41,7 @@ public class SpringAiConfig { return PgVectorStore.builder(jdbcTemplate, embeddingModel) .dimensions(1536) // 向量维度,与配置一致 .distanceType(PgVectorStore.PgDistanceType.COSINE_DISTANCE) - .initializeSchema(true) // 自动初始化schema + .initializeSchema(false) // 禁用自动初始化,使用SQL文件中已创建的表 .build(); } } diff --git a/src/main/java/cn/yinlihupo/domain/entity/AiDocument.java b/src/main/java/cn/yinlihupo/domain/entity/AiDocument.java index e2b25ea..d8bf6b5 100644 --- a/src/main/java/cn/yinlihupo/domain/entity/AiDocument.java +++ b/src/main/java/cn/yinlihupo/domain/entity/AiDocument.java @@ -9,20 +9,20 @@ import java.time.LocalDate; import java.time.LocalDateTime; /** - * AI文档向量实体 - * 对应 ai_document 表 + * 向量存储实体 + * 对应 vector_store 表 + * 兼容 Spring AI PgVectorStore 默认结构 */ @Data -@TableName("ai_document") +@TableName("vector_store") public class AiDocument { - @TableId(type = IdType.AUTO) - private Long id; - /** - * 文档唯一标识(UUID) + * 文档ID(字符串类型,兼容PgVectorStore) + * 使用标准UUID格式(带连字符) */ - private String docId; + @TableId(type = IdType.INPUT) + private String id; /** * 关联项目ID @@ -124,7 +124,7 @@ public class AiDocument { /** * 父文档ID(分块时使用) */ - private Long chunkParentId; + private String chunkParentId; /** * 标签数组(JSON) diff --git a/src/main/java/cn/yinlihupo/domain/vo/KbDocumentVO.java b/src/main/java/cn/yinlihupo/domain/vo/KbDocumentVO.java index 5f21392..1913d95 100644 --- a/src/main/java/cn/yinlihupo/domain/vo/KbDocumentVO.java +++ b/src/main/java/cn/yinlihupo/domain/vo/KbDocumentVO.java @@ -13,10 +13,10 @@ public class KbDocumentVO { /** * 文档ID */ - private Long id; + private String id; /** - * 文档UUID + * 文档UUID(与id相同) */ private String docId; diff --git a/src/main/java/cn/yinlihupo/domain/vo/ReferencedDocVO.java b/src/main/java/cn/yinlihupo/domain/vo/ReferencedDocVO.java index 9494cfc..001c372 100644 --- a/src/main/java/cn/yinlihupo/domain/vo/ReferencedDocVO.java +++ b/src/main/java/cn/yinlihupo/domain/vo/ReferencedDocVO.java @@ -11,10 +11,10 @@ public class ReferencedDocVO { /** * 文档ID */ - private Long id; + private String id; /** - * 文档UUID + * 文档UUID(与id相同) */ private String docId; diff --git a/src/main/java/cn/yinlihupo/mapper/AiDocumentMapper.java b/src/main/java/cn/yinlihupo/mapper/AiDocumentMapper.java index 9377160..ba7ef16 100644 --- a/src/main/java/cn/yinlihupo/mapper/AiDocumentMapper.java +++ b/src/main/java/cn/yinlihupo/mapper/AiDocumentMapper.java @@ -10,7 +10,7 @@ import org.apache.ibatis.annotations.Param; import java.util.List; /** - * AI文档向量Mapper + * 向量存储Mapper */ @Mapper public interface AiDocumentMapper extends BaseMapper { @@ -24,17 +24,17 @@ public interface AiDocumentMapper extends BaseMapper { List selectProjectDocuments(@Param("projectId") Long projectId); /** - * 根据docId查询文档 + * 根据id查询文档 * - * @param docId 文档UUID + * @param docId 文档ID * @return 文档实体 */ AiDocument selectByDocId(@Param("docId") String docId); /** - * 根据docId删除文档 + * 根据id删除文档 * - * @param docId 文档UUID + * @param docId 文档ID * @return 影响行数 */ int deleteByDocId(@Param("docId") String docId); @@ -45,7 +45,7 @@ public interface AiDocumentMapper extends BaseMapper { * @param docIds 文档ID列表 * @return 文档信息列表 */ - List selectReferencedDocs(@Param("docIds") List docIds); + List selectReferencedDocs(@Param("docIds") List docIds); /** * 获取父文档的分块数量 @@ -53,12 +53,12 @@ public interface AiDocumentMapper extends BaseMapper { * @param docId 父文档ID * @return 分块数量 */ - Integer selectChunkCount(@Param("docId") Long docId); + Integer selectChunkCount(@Param("docId") String docId); /** * 更新文档状态 * - * @param docId 文档UUID + * @param docId 文档ID * @param status 状态 * @return 影响行数 */ @@ -67,7 +67,7 @@ public interface AiDocumentMapper extends BaseMapper { /** * 更新文档错误信息 * - * @param docId 文档UUID + * @param docId 文档ID * @param errorMessage 错误信息 * @return 影响行数 */ @@ -79,7 +79,7 @@ public interface AiDocumentMapper extends BaseMapper { * @param id 文档ID * @return 影响行数 */ - int incrementViewCount(@Param("id") Long id); + int incrementViewCount(@Param("id") String id); /** * 增加文档查询次数 @@ -87,5 +87,5 @@ public interface AiDocumentMapper extends BaseMapper { * @param id 文档ID * @return 影响行数 */ - int incrementQueryCount(@Param("id") Long id); + int incrementQueryCount(@Param("id") String id); } diff --git a/src/main/java/cn/yinlihupo/service/ai/AiKnowledgeBaseService.java b/src/main/java/cn/yinlihupo/service/ai/AiKnowledgeBaseService.java index baff9e9..9fd48bf 100644 --- a/src/main/java/cn/yinlihupo/service/ai/AiKnowledgeBaseService.java +++ b/src/main/java/cn/yinlihupo/service/ai/AiKnowledgeBaseService.java @@ -49,12 +49,12 @@ public interface AiKnowledgeBaseService { * * @param docId 文档ID */ - void processDocument(Long docId); + void processDocument(String docId); /** * 异步处理文档 * * @param docId 文档ID */ - void processDocumentAsync(Long docId); + void processDocumentAsync(String docId); } diff --git a/src/main/java/cn/yinlihupo/service/ai/impl/AiKnowledgeBaseServiceImpl.java b/src/main/java/cn/yinlihupo/service/ai/impl/AiKnowledgeBaseServiceImpl.java index 97d25e1..97b140c 100644 --- a/src/main/java/cn/yinlihupo/service/ai/impl/AiKnowledgeBaseServiceImpl.java +++ b/src/main/java/cn/yinlihupo/service/ai/impl/AiKnowledgeBaseServiceImpl.java @@ -55,7 +55,7 @@ public class AiKnowledgeBaseServiceImpl implements AiKnowledgeBaseService { // 4. 保存文档元数据 AiDocument doc = new AiDocument(); - doc.setDocId(docId); + doc.setId(docId); // 设置标准UUID格式的ID doc.setProjectId(projectId); doc.setSourceType("upload"); doc.setTitle(originalFilename); @@ -75,7 +75,7 @@ public class AiKnowledgeBaseServiceImpl implements AiKnowledgeBaseService { // 5. 异步处理文档(解析、切片、向量化) documentProcessor.processDocumentAsync(doc.getId()); - log.info("文件上传成功: {}, docId: {}", originalFilename, docId); + log.info("文件上传成功: {}, docId: {}", originalFilename, doc.getId()); // 6. 返回VO return convertToVO(doc); @@ -133,13 +133,13 @@ public class AiKnowledgeBaseServiceImpl implements AiKnowledgeBaseService { } @Override - public void processDocument(Long docId) { + public void processDocument(String docId) { documentProcessor.processDocument(docId); } @Override @Async - public void processDocumentAsync(Long docId) { + public void processDocumentAsync(String docId) { documentProcessor.processDocument(docId); } @@ -197,7 +197,7 @@ public class AiKnowledgeBaseServiceImpl implements AiKnowledgeBaseService { private KbDocumentVO convertToVO(AiDocument doc) { KbDocumentVO vo = new KbDocumentVO(); vo.setId(doc.getId()); - vo.setDocId(doc.getDocId()); + vo.setDocId(doc.getId()); vo.setTitle(doc.getTitle()); vo.setDocType(doc.getDocType()); vo.setFileType(doc.getFileType()); diff --git a/src/main/java/cn/yinlihupo/service/ai/rag/DocumentProcessor.java b/src/main/java/cn/yinlihupo/service/ai/rag/DocumentProcessor.java index 84e0fc2..edb67f9 100644 --- a/src/main/java/cn/yinlihupo/service/ai/rag/DocumentProcessor.java +++ b/src/main/java/cn/yinlihupo/service/ai/rag/DocumentProcessor.java @@ -16,6 +16,7 @@ import java.io.InputStream; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.UUID; import java.util.stream.Collectors; /** @@ -41,7 +42,7 @@ public class DocumentProcessor { * * @param docId 文档ID */ - public void processDocument(Long docId) { + public void processDocument(String docId) { AiDocument doc = documentMapper.selectById(docId); if (doc == null) { log.error("文档不存在: {}", docId); @@ -91,7 +92,7 @@ public class DocumentProcessor { * @param docId 文档ID */ @Async("documentTaskExecutor") - public void processDocumentAsync(Long docId) { + public void processDocumentAsync(String docId) { processDocument(docId); } @@ -169,23 +170,25 @@ public class DocumentProcessor { * @param chunks 切片列表 */ private void storeChunks(AiDocument parentDoc, List chunks) { - String docId = parentDoc.getDocId(); - Long parentId = parentDoc.getId(); + String parentId = parentDoc.getId(); for (int i = 0; i < chunks.size(); i++) { String chunkContent = chunks.get(i); + // 使用UUID生成唯一的chunk ID,确保格式正确 + String chunkId = UUID.randomUUID().toString(); // 创建向量文档 Document vectorDoc = new Document( + chunkId, chunkContent, Map.of( - "doc_id", docId.toString(), - "project_id", parentDoc.getProjectId(), - "timeline_node_id", parentDoc.getTimelineNodeId() != null ? parentDoc.getTimelineNodeId() : "", + "project_id", parentDoc.getProjectId() != null ? parentDoc.getProjectId().toString() : "", + "timeline_node_id", parentDoc.getTimelineNodeId() != null ? parentDoc.getTimelineNodeId().toString() : "", "chunk_index", i, "chunk_total", chunks.size(), - "title", parentDoc.getTitle(), - "source_type", parentDoc.getSourceType(), + "chunk_parent_id", parentId, + "title", parentDoc.getTitle() != null ? parentDoc.getTitle() : "", + "source_type", parentDoc.getSourceType() != null ? parentDoc.getSourceType() : "", "status", "active" ) ); @@ -199,7 +202,7 @@ public class DocumentProcessor { documentMapper.updateById(parentDoc); } - log.debug("存储切片: {}/{}, docId: {}", i + 1, chunks.size(), docId); + log.debug("存储切片: {}/{}, parentId: {}, chunkId: {}", i + 1, chunks.size(), parentId, chunkId); } } diff --git a/src/main/resources/application-dev.yaml b/src/main/resources/application-dev.yaml index e57e47e..439c274 100644 --- a/src/main/resources/application-dev.yaml +++ b/src/main/resources/application-dev.yaml @@ -46,14 +46,21 @@ spring: ai: openai: - api-key: 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 - base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api + ##嵌入式模型 embedding: options: - model: qwen/qwen3-embedding-8b + model: text-embedding-v4 + base-url: https://dashscope.aliyuncs.com/compatible-mode + api-key: sk-85c3ccc7c63747c485f9699c90f1972f + ##聊天模型 chat: options: model: google/gemini-3.1-pro-preview + api-key: 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 + base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api + ##其他模型的apikey + base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api + api-key: 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 # MinIO 对象存储配置 minio: diff --git a/src/main/resources/mapper/AiDocumentMapper.xml b/src/main/resources/mapper/AiDocumentMapper.xml index 6330250..3f4d597 100644 --- a/src/main/resources/mapper/AiDocumentMapper.xml +++ b/src/main/resources/mapper/AiDocumentMapper.xml @@ -5,54 +5,52 @@ - + - + - UPDATE ai_document + UPDATE vector_store SET deleted = 1, update_time = NOW() - WHERE doc_id = #{docId} + WHERE id = #{docId} SELECT COUNT(*) - FROM ai_document + FROM vector_store WHERE chunk_parent_id = #{docId} AND deleted = 0 - UPDATE ai_document + UPDATE vector_store SET status = #{status}, update_time = NOW() - WHERE doc_id = #{docId} + WHERE id = #{docId} - UPDATE ai_document + UPDATE vector_store SET error_message = #{errorMessage}, status = 'error', update_time = NOW() - WHERE doc_id = #{docId} + WHERE id = #{docId} - UPDATE ai_document + UPDATE vector_store SET view_count = view_count + 1 WHERE id = #{id} - UPDATE ai_document + UPDATE vector_store SET query_count = query_count + 1, last_queried_at = NOW() WHERE id = #{id}