From 852cbd60a08bb7921e87d235c16fe9767ea6f071 Mon Sep 17 00:00:00 2001 From: JiaoTianBo Date: Thu, 26 Mar 2026 17:59:18 +0800 Subject: [PATCH] =?UTF-8?q?feat(project):=20=E5=AE=9E=E7=8E=B0AI=E9=A1=B9?= =?UTF-8?q?=E7=9B=AE=E5=88=9D=E5=A7=8B=E5=8C=96=E5=8F=8A=E6=96=87=E6=A1=A3?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增DocumentParserUtil工具类,支持PDF、Word、Excel、Markdown及文本解析 - 基于MinIO实现OssService,支持文件上传、下载、删除及URL生成 - 添加ProjectService实现,利用Spring AI ChatClient解析项目文档生成结构化数据 - 新增ProjectController,提供文件上传接口供项目初始化调用 - 配置开发环境application-dev.yaml,包含数据库、MinIO及Spring AI相关配置 - 添加pom.xml,集成必要依赖如Spring AI、MinIO、Apache POI、PDFBox、Tika和Flexmark等组件 --- pom.xml | 47 +++- .../common/util/DocumentParserUtil.java | 261 ++++++++++++++++++ .../controller/project/ProjectController.java | 23 -- .../service/oss/impl/OssServiceImpl.java | 42 ++- .../project/impl/ProjectServiceImpl.java | 6 +- src/main/resources/application-dev.yaml | 5 +- 6 files changed, 342 insertions(+), 42 deletions(-) create mode 100644 src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java diff --git a/pom.xml b/pom.xml index 61a4ec4..7eed89e 100644 --- a/pom.xml +++ b/pom.xml @@ -81,11 +81,50 @@ 8.5.7 - + + - com.amazonaws - aws-java-sdk-s3 - 1.12.700 + org.apache.pdfbox + pdfbox + 2.0.30 + + + + + org.apache.poi + poi-ooxml + 5.2.5 + + + org.apache.poi + poi-scratchpad + 5.2.5 + + + + + org.apache.poi + poi-ooxml-schemas + 4.1.2 + + + + + com.vladsch.flexmark + flexmark-all + 0.64.8 + + + + + org.apache.tika + tika-core + 2.9.1 + + + org.apache.tika + tika-parsers-standard-package + 2.9.1 diff --git a/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java b/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java new file mode 100644 index 0000000..84360f3 --- /dev/null +++ b/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java @@ -0,0 +1,261 @@ +package cn.yinlihupo.common.util; + +import 
lombok.extern.slf4j.Slf4j;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.ss.usermodel.*;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.Tika;
+
+import java.io.BufferedInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Extracts plain text from uploaded documents. PDF, Word (.doc/.docx), Excel
+ * (.xls/.xlsx), Markdown and plain-text files have dedicated parsers; every other
+ * format is handed to Apache Tika. Each parser reports failure by throwing a
+ * {@link RuntimeException} that wraps the underlying cause.
+ */
+@Slf4j
+public final class DocumentParserUtil {
+
+    /** Tika facade used for MIME detection and as the fallback parser. */
+    private static final Tika TIKA = new Tika();
+
+    private DocumentParserUtil() {
+        // Static utility class; not meant to be instantiated.
+    }
+
+    /**
+     * Detects the document type and extracts its text with the matching parser.
+     * The file extension is checked first and the detected MIME type second, so a
+     * stream with a missing or unhelpful name can still reach the right parser.
+     *
+     * @param inputStream document content
+     * @param fileName    original file name; may be {@code null}
+     * @return extracted text
+     * @throws RuntimeException if detection or parsing fails; the cause is preserved
+     */
+    public static String parse(InputStream inputStream, String fileName) {
+        try {
+            // Buffer the stream so it can be rewound after type detection.
+            BufferedInputStream bufferedStream = new BufferedInputStream(inputStream);
+            bufferedStream.mark(Integer.MAX_VALUE);
+
+            // A null name must not abort parsing: fall back to MIME-only dispatch.
+            String safeName = fileName == null ? "" : fileName;
+            String mimeType = TIKA.detect(bufferedStream, safeName);
+            log.info("检测到文件类型: {}, 文件名: {}", mimeType, fileName);
+            // Rewind to the first byte before handing the stream to a parser.
+            bufferedStream.reset();
+
+            String lowerFileName = safeName.toLowerCase(Locale.ROOT);
+            if (lowerFileName.endsWith(".pdf") || "application/pdf".equals(mimeType)) {
+                return parsePdf(bufferedStream);
+            } else if (lowerFileName.endsWith(".docx")
+                    || "application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(mimeType)) {
+                return parseDocx(bufferedStream);
+            } else if (lowerFileName.endsWith(".doc") || "application/msword".equals(mimeType)) {
+                return parseDoc(bufferedStream);
+            } else if (lowerFileName.endsWith(".xlsx") || lowerFileName.endsWith(".xls")
+                    || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(mimeType)
+                    || "application/vnd.ms-excel".equals(mimeType)) {
+                return parseExcel(bufferedStream);
+            } else if (lowerFileName.endsWith(".md") || lowerFileName.endsWith(".markdown") || "text/markdown".equals(mimeType)) {
+                return parseMarkdown(bufferedStream);
+            } else if (lowerFileName.endsWith(".txt") || (mimeType != null && mimeType.startsWith("text/"))) {
+                return parseText(bufferedStream);
+            } else {
+                // Anything else goes through Tika's generic parser.
+                return TIKA.parseToString(bufferedStream);
+            }
+        } catch (Exception e) {
+            log.error("解析文档失败: {}", fileName, e);
+            throw new RuntimeException("解析文档失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Extracts the text of a PDF document.
+     *
+     * @param inputStream PDF content
+     * @return extracted text
+     */
+    public static String parsePdf(InputStream inputStream) {
+        try (PDDocument document = PDDocument.load(inputStream)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            // Emit text in visual reading order rather than content-stream order.
+            stripper.setSortByPosition(true);
+            String text = stripper.getText(document);
+            log.info("PDF解析成功,共 {} 页", document.getNumberOfPages());
+            return text;
+        } catch (Exception e) {
+            log.error("PDF解析失败", e);
+            throw new RuntimeException("PDF解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Extracts the paragraph text of a Word 2007+ (.docx) document.
+     *
+     * @param inputStream DOCX content
+     * @return paragraph text, one paragraph per line
+     */
+    public static String parseDocx(InputStream inputStream) {
+        try (XWPFDocument document = new XWPFDocument(inputStream)) {
+            StringBuilder text = new StringBuilder();
+            List<XWPFParagraph> paragraphs = document.getParagraphs();
+            for (XWPFParagraph paragraph : paragraphs) {
+                text.append(paragraph.getText()).append("\n");
+            }
+            log.info("DOCX解析成功,共 {} 段落", paragraphs.size());
+            return text.toString();
+        } catch (Exception e) {
+            log.error("DOCX解析失败", e);
+            throw new RuntimeException("DOCX解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Extracts the text of a Word 97-2003 (.doc) document.
+     *
+     * @param inputStream DOC content
+     * @return extracted text
+     */
+    public static String parseDoc(InputStream inputStream) {
+        try (HWPFDocument document = new HWPFDocument(inputStream);
+             WordExtractor extractor = new WordExtractor(document)) {
+            String text = extractor.getText();
+            log.info("DOC解析成功");
+            return text;
+        } catch (Exception e) {
+            log.error("DOC解析失败", e);
+            throw new RuntimeException("DOC解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Extracts the cell text of an Excel workbook (.xlsx or .xls). Each sheet starts
+     * with a "Sheet: name" line; a row's non-empty cells are joined with tabs.
+     *
+     * @param inputStream workbook content
+     * @return extracted text
+     */
+    public static String parseExcel(InputStream inputStream) {
+        try (Workbook workbook = WorkbookFactory.create(inputStream)) {
+            StringBuilder text = new StringBuilder();
+            int sheetCount = workbook.getNumberOfSheets();
+
+            for (int i = 0; i < sheetCount; i++) {
+                Sheet sheet = workbook.getSheetAt(i);
+                text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
+
+                for (Row row : sheet) {
+                    StringBuilder rowText = new StringBuilder();
+                    for (Cell cell : row) {
+                        String cellValue = getCellValueAsString(cell);
+                        if (!cellValue.isEmpty()) {
+                            rowText.append(cellValue).append("\t");
+                        }
+                    }
+                    if (rowText.length() > 0) {
+                        text.append(rowText.toString().trim()).append("\n");
+                    }
+                }
+                text.append("\n");
+            }
+
+            log.info("Excel解析成功,共 {} 个Sheet", sheetCount);
+            return text.toString();
+        } catch (Exception e) {
+            log.error("Excel解析失败", e);
+            throw new RuntimeException("Excel解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Renders a cell as text. Formula cells yield their cached result rather than
+     * the formula source, so the output matches what the sheet displays.
+     */
+    private static String getCellValueAsString(Cell cell) {
+        if (cell == null) {
+            return "";
+        }
+
+        CellType type = cell.getCellType();
+        if (type == CellType.FORMULA) {
+            type = cell.getCachedFormulaResultType();
+        }
+
+        switch (type) {
+            case STRING:
+                return cell.getStringCellValue();
+            case NUMERIC:
+                if (DateUtil.isCellDateFormatted(cell)) {
+                    return cell.getDateCellValue().toString();
+                }
+                return String.valueOf(cell.getNumericCellValue());
+            case BOOLEAN:
+                return String.valueOf(cell.getBooleanCellValue());
+            default:
+                // BLANK, ERROR and any other type contribute no text.
+                return "";
+        }
+    }
+
+    /**
+     * Reads a Markdown file and returns its source text unchanged.
+     *
+     * @param inputStream Markdown content, decoded as UTF-8
+     * @return the raw Markdown text
+     */
+    public static String parseMarkdown(InputStream inputStream) {
+        try {
+            String content = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8);
+            log.info("Markdown解析成功");
+            return content;
+        } catch (Exception e) {
+            log.error("Markdown解析失败", e);
+            throw new RuntimeException("Markdown解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads a plain-text file.
+     *
+     * @param inputStream text content, decoded as UTF-8
+     * @return the file content
+     */
+    public static String parseText(InputStream inputStream) {
+        try {
+            String content = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8);
+            log.info("文本文件解析成功");
+            return content;
+        } catch (Exception e) {
+            log.error("文本文件解析失败", e);
+            throw new RuntimeException("文本文件解析失败: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Detects the MIME type of a document.
+     *
+     * @param inputStream document content
+     * @param fileName    file name used as a detection hint
+     * @return the detected MIME type, or {@code application/octet-stream} if detection fails
+     */
+    public static String detectMimeType(InputStream inputStream, String fileName) {
+        try {
+            return TIKA.detect(inputStream, fileName);
+        } catch (Exception e) {
+            log.error("检测文件类型失败", e);
+            return "application/octet-stream";
+        }
+    }
+}
diff --git a/src/main/java/cn/yinlihupo/controller/project/ProjectController.java b/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
index 3090541..fa7e543 100644
--- a/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
+++ b/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
@@ -23,29 +23,6 @@ public class ProjectController {
     private final ProjectService projectService;
     private final OssService ossService;
 
-    /**
-     * 根据文本内容生成项目初始化数据
-     *
-     * @param request 包含项目资料内容的请求
-     * @return 项目初始化结构化数据
-     */
-    @PostMapping("/from-content")
-    public Result generateFromContent(@RequestBody ProjectInitRequest request) {
-        log.info("收到项目初始化请求(文本内容)");
-
-        if (request.getContent() == null || request.getContent().trim().isEmpty()) {
-            return Result.error("项目资料内容不能为空");
-        }
-
-        try {
-            ProjectInitResult result = projectService.generateProjectFromContent(request.getContent());
-            return Result.success("项目初始化成功", result);
-        } catch (Exception e) {
-            log.error("项目初始化失败: {}", e.getMessage(), e);
-            return Result.error("项目初始化失败: " + e.getMessage());
-        }
-    }
-
     /**
      * 上传文件并生成项目初始化数据
      *
diff --git a/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java b/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
index f8e25f8..8c36ef8 100644
--- a/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
+++ b/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
@@ -1,6 +1,7 @@
 package cn.yinlihupo.service.oss.impl;
 
 import cn.yinlihupo.common.config.MinioConfig;
+import cn.yinlihupo.common.util.DocumentParserUtil;
 import cn.yinlihupo.service.oss.OssService;
 import io.minio.*;
 import io.minio.errors.*;
@@ -61,22 +62,43 @@ public class OssServiceImpl implements OssService {
 
     @Override
     public String readFileAsString(String fileUrl) {
-        try (InputStream inputStream = getFileInputStream(fileUrl);
-             ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
-
-            byte[] buffer = new byte[1024];
-            int bytesRead;
-            while ((bytesRead = inputStream.read(buffer)) != -1) {
-                outputStream.write(buffer, 0, bytesRead);
-            }
-
-            return outputStream.toString(StandardCharsets.UTF_8);
+        try (InputStream inputStream = getFileInputStream(fileUrl)) {
+            // Take the file name from the URL so the parser can dispatch on its extension.
+            String fileName = extractFileNameFromUrl(fileUrl);
+            // Let the format-aware document parser extract the text.
+            return DocumentParserUtil.parse(inputStream, fileName);
         } catch (Exception e) {
             log.error("读取文件内容失败: {}", e.getMessage(), e);
             throw new RuntimeException("读取文件内容失败: " + e.getMessage(), e);
         }
     }
 
+    /**
+     * Extracts the file name (the last path segment) from a file URL.
+     *
+     * @param fileUrl file URL
+     * @return the last path segment, or {@code "unknown"} if the URL cannot be parsed
+     */
+    private String extractFileNameFromUrl(String fileUrl) {
+        try {
+            URL url = new URL(fileUrl);
+            String path = url.getPath();
+            // Drop the leading slash.
+            if (path.startsWith("/")) {
+                path = path.substring(1);
+            }
+            // Keep only what follows the last slash.
+            int lastSlashIndex = path.lastIndexOf('/');
+            if (lastSlashIndex >= 0) {
+                return path.substring(lastSlashIndex + 1);
+            }
+            return path;
+        } catch (Exception e) {
+            log.warn("从 URL 提取文件名失败: {}", fileUrl, e);
+            return "unknown";
+        }
+    }
+
     @Override
     public InputStream getFileInputStream(String fileUrl) {
         try {
diff --git a/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java b/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
index db31d05..e6b48aa 100644
--- a/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
+++ b/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
@@ -147,8 +147,10 @@ public class ProjectServiceImpl implements ProjectService {
     public ProjectInitResult generateProjectFromContent(String content) {
         log.info("开始根据内容生成项目初始化数据");
 
-        PromptTemplate promptTemplate = new PromptTemplate(USER_PROMPT_TEMPLATE);
-        String userPrompt = promptTemplate.createMessage(java.util.Map.of("content", content)).toString();
+        // Build the user prompt by embedding the document text directly.
+        String userPrompt = "请根据以下项目资料,生成完整的项目初始化结构化数据:\n\n" +
+                content + "\n\n" +
+                "请严格按照系统提示词中的JSON格式输出,确保所有字段都包含合理的值。";
 
         return chatClient.prompt()
                 .system(PROJECT_INIT_SYSTEM_PROMPT)
diff --git a/src/main/resources/application-dev.yaml b/src/main/resources/application-dev.yaml
index f396cb1..5e09858 100644
--- a/src/main/resources/application-dev.yaml
+++ b/src/main/resources/application-dev.yaml
@@ -37,12 +37,11 @@ spring:
       base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api/v1
       chat:
         options:
-          model: gpt-4o
-          temperature: 0.3
+          model: Qwen3
 
 # MinIO 对象存储配置
 minio:
-  endpoint: 10.200.8.25:9000
+  endpoint: http://10.200.8.25:9000
   access-key: minioadmin
   secret-key: minioadmin
   bucket-name: ylhp-files