diff --git a/pom.xml b/pom.xml
index 61a4ec4..7eed89e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -81,11 +81,50 @@
8.5.7
-
+
+
- com.amazonaws
- aws-java-sdk-s3
- 1.12.700
+ org.apache.pdfbox
+ pdfbox
+ 2.0.30
+
+
+
+
+ org.apache.poi
+ poi-ooxml
+ 5.2.5
+
+
+ org.apache.poi
+ poi-scratchpad
+ 5.2.5
+
+
+
+
+ org.apache.poi
+ poi-ooxml-schemas
+ 4.1.2
+
+
+
+
+ com.vladsch.flexmark
+ flexmark-all
+ 0.64.8
+
+
+
+
+ org.apache.tika
+ tika-core
+ 2.9.1
+
+
+ org.apache.tika
+ tika-parsers-standard-package
+ 2.9.1
diff --git a/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java b/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java
new file mode 100644
index 0000000..84360f3
--- /dev/null
+++ b/src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java
@@ -0,0 +1,261 @@
+package cn.yinlihupo.common.util;
+
+import com.vladsch.flexmark.html.HtmlRenderer;
+import com.vladsch.flexmark.parser.Parser;
+import com.vladsch.flexmark.util.data.MutableDataSet;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.ss.usermodel.*;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * 文档解析工具类
+ * 支持 PDF、Word、Excel、Markdown 等格式的文档解析
+ */
+@Slf4j
+public class DocumentParserUtil {
+
+ private static final Tika tika = new Tika();
+
+ /**
+  * Detects the type of a document and extracts its text content.
+  *
+  * @param inputStream document input stream
+  * @param fileName    file name used as a type hint; {@code null} is treated as having no extension
+  * @return text content of the document
+  * @throws RuntimeException if type detection or parsing fails
+  */
+ public static String parse(InputStream inputStream, String fileName) {
+     try {
+         // Wrap the stream in a BufferedInputStream so that mark/reset is available.
+         BufferedInputStream bufferedStream = new BufferedInputStream(inputStream);
+         bufferedStream.mark(Integer.MAX_VALUE);
+
+         // Detect the MIME type from the stream, with the file name as a hint.
+         String mimeType = tika.detect(bufferedStream, fileName);
+         log.info("检测到文件类型: {}, 文件名: {}", mimeType, fileName);
+
+         // Rewind so the selected parser reads the document from its first byte.
+         bufferedStream.reset();
+
+         // Extension first, MIME type second; a null name matches no extension instead of throwing.
+         String lowerFileName = fileName == null ? "" : fileName.toLowerCase(java.util.Locale.ROOT);
+
+         if (lowerFileName.endsWith(".pdf") || "application/pdf".equals(mimeType)) {
+             return parsePdf(bufferedStream);
+         } else if (lowerFileName.endsWith(".docx") || "application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(mimeType)) {
+             return parseDocx(bufferedStream);
+         } else if (lowerFileName.endsWith(".doc") || "application/msword".equals(mimeType)) {
+             return parseDoc(bufferedStream);
+         } else if (lowerFileName.endsWith(".xlsx") || lowerFileName.endsWith(".xls")
+                 || "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet".equals(mimeType) || "application/vnd.ms-excel".equals(mimeType)) {
+             return parseExcel(bufferedStream);
+         } else if (lowerFileName.endsWith(".md") || lowerFileName.endsWith(".markdown") || "text/markdown".equals(mimeType)) {
+             return parseMarkdown(bufferedStream);
+         } else if (lowerFileName.endsWith(".txt") || mimeType.startsWith("text/")) {
+             return parseText(bufferedStream);
+         } else {
+             // No dedicated parser matched: let Tika extract the text generically.
+             return tika.parseToString(bufferedStream);
+         }
+     } catch (Exception e) {
+         log.error("解析文档失败: {}", fileName, e);
+         throw new RuntimeException("解析文档失败: " + e.getMessage(), e);
+     }
+ }
+
+ /**
+  * Extracts the text of a PDF document.
+  *
+  * @param inputStream PDF input stream
+  * @return extracted text
+  */
+ public static String parsePdf(InputStream inputStream) {
+     try (PDDocument pdf = PDDocument.load(inputStream)) {
+         PDFTextStripper textStripper = new PDFTextStripper();
+         // Ask the stripper to order the extracted text by its position on the page.
+         textStripper.setSortByPosition(true);
+         String extracted = textStripper.getText(pdf);
+         log.info("PDF解析成功,共 {} 页", pdf.getNumberOfPages());
+         return extracted;
+     } catch (Exception e) {
+         log.error("PDF解析失败", e);
+         throw new RuntimeException("PDF解析失败: " + e.getMessage(), e);
+     }
+ }
+
+ /**
+  * Extracts the paragraph text of a Word 2007+ ({@code .docx}) document.
+  *
+  * @param inputStream DOCX input stream
+  * @return the text of every paragraph returned by {@code getParagraphs()}, one paragraph per line
+  */
+ public static String parseDocx(InputStream inputStream) {
+     try (XWPFDocument document = new XWPFDocument(inputStream)) {
+         StringBuilder text = new StringBuilder();
+         List<XWPFParagraph> paragraphs = document.getParagraphs();
+         for (XWPFParagraph paragraph : paragraphs) {
+             text.append(paragraph.getText()).append("\n");
+         }
+         log.info("DOCX解析成功,共 {} 段落", paragraphs.size());
+         return text.toString();
+     } catch (Exception e) {
+         log.error("DOCX解析失败", e);
+         throw new RuntimeException("DOCX解析失败: " + e.getMessage(), e);
+     }
+ }
+
+ /**
+  * Extracts the text of a Word 97-2003 ({@code .doc}) document.
+  *
+  * @param inputStream DOC input stream
+  * @return extracted text
+  */
+ public static String parseDoc(InputStream inputStream) {
+     try (HWPFDocument legacyDoc = new HWPFDocument(inputStream);
+          WordExtractor wordExtractor = new WordExtractor(legacyDoc)) {
+         String extracted = wordExtractor.getText();
+         log.info("DOC解析成功");
+         return extracted;
+     } catch (Exception e) {
+         log.error("DOC解析失败", e);
+         throw new RuntimeException("DOC解析失败: " + e.getMessage(), e);
+     }
+ }
+
+ /**
+  * Extracts the cell text of every sheet of an Excel workbook ({@code .xlsx} or {@code .xls}).
+  *
+  * @param inputStream Excel input stream
+  * @return a "Sheet: name" header per sheet followed by its non-empty rows, cells separated by tabs
+  */
+ public static String parseExcel(InputStream inputStream) {
+     try (Workbook book = WorkbookFactory.create(inputStream)) {
+         StringBuilder out = new StringBuilder();
+         int totalSheets = book.getNumberOfSheets();
+
+         for (int sheetIndex = 0; sheetIndex < totalSheets; sheetIndex++) {
+             Sheet current = book.getSheetAt(sheetIndex);
+             out.append("Sheet: ").append(current.getSheetName()).append("\n");
+             for (Row row : current) {
+                 StringBuilder line = new StringBuilder();
+                 for (Cell cell : row) {
+                     String value = getCellValueAsString(cell);
+                     if (value.isEmpty()) {
+                         continue; // empty cells contribute nothing, not even a tab
+                     }
+                     line.append(value).append("\t");
+                 }
+                 if (line.length() > 0) {
+                     out.append(line.toString().trim()).append("\n");
+                 }
+             }
+             out.append("\n");
+         }
+
+         log.info("Excel解析成功,共 {} 个Sheet", totalSheets);
+         return out.toString();
+     } catch (Exception e) {
+         log.error("Excel解析失败", e);
+         throw new RuntimeException("Excel解析失败: " + e.getMessage(), e);
+     }
+ }
+
+ /**
+  * Returns the value of a cell as text; a formula cell yields its cached result, not the formula expression.
+  */
+ private static String getCellValueAsString(Cell cell) {
+     if (cell == null) {
+         return "";
+     }
+
+     // For a formula, switch on the type of its cached result; the typed getters below then return that result.
+     CellType type = cell.getCellType() == CellType.FORMULA ? cell.getCachedFormulaResultType() : cell.getCellType();
+     switch (type) {
+         case STRING:
+             return cell.getStringCellValue();
+         case NUMERIC:
+             if (DateUtil.isCellDateFormatted(cell)) {
+                 return cell.getDateCellValue().toString();
+             }
+             return String.valueOf(cell.getNumericCellValue());
+         case BOOLEAN:
+             return String.valueOf(cell.getBooleanCellValue());
+         default:
+             return "";
+     }
+ }
+
+ /**
+ * 解析 Markdown 文件
+ *
+ * @param inputStream Markdown文件输入流
+ * @return 文本内容
+ */
+ public static String parseMarkdown(InputStream inputStream) {
+ try {
+ String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
+
+ // 配置 Flexmark
+ MutableDataSet options = new MutableDataSet();
+ Parser parser = Parser.builder(options).build();
+ HtmlRenderer renderer = HtmlRenderer.builder(options).build();
+
+ // 解析 Markdown 为 HTML(可选,如果需要纯文本可以直接返回原始内容)
+ com.vladsch.flexmark.util.ast.Node document = parser.parse(content);
+ String html = renderer.render(document);
+
+ log.info("Markdown解析成功");
+ // 返回原始 Markdown 文本,如需 HTML 可返回 html 变量
+ return content;
+ } catch (Exception e) {
+ log.error("Markdown解析失败", e);
+ throw new RuntimeException("Markdown解析失败: " + e.getMessage(), e);
+ }
+ }
+
+ /**
+ * 解析纯文本文件
+ *
+ * @param inputStream 文本文件输入流
+ * @return 文本内容
+ */
+ public static String parseText(InputStream inputStream) {
+ try {
+ String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
+ log.info("文本文件解析成功");
+ return content;
+ } catch (Exception e) {
+ log.error("文本文件解析失败", e);
+ throw new RuntimeException("文本文件解析失败: " + e.getMessage(), e);
+ }
+ }
+
+ /**
+ * 检测文件类型
+ *
+ * @param inputStream 文件输入流
+ * @param fileName 文件名
+ * @return MIME类型
+ */
+ public static String detectMimeType(InputStream inputStream, String fileName) {
+ try {
+ return tika.detect(inputStream, fileName);
+ } catch (Exception e) {
+ log.error("检测文件类型失败", e);
+ return "application/octet-stream";
+ }
+ }
+}
diff --git a/src/main/java/cn/yinlihupo/controller/project/ProjectController.java b/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
index 3090541..fa7e543 100644
--- a/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
+++ b/src/main/java/cn/yinlihupo/controller/project/ProjectController.java
@@ -23,29 +23,6 @@ public class ProjectController {
private final ProjectService projectService;
private final OssService ossService;
- /**
- * 根据文本内容生成项目初始化数据
- *
- * @param request 包含项目资料内容的请求
- * @return 项目初始化结构化数据
- */
- @PostMapping("/from-content")
- public Result generateFromContent(@RequestBody ProjectInitRequest request) {
- log.info("收到项目初始化请求(文本内容)");
-
- if (request.getContent() == null || request.getContent().trim().isEmpty()) {
- return Result.error("项目资料内容不能为空");
- }
-
- try {
- ProjectInitResult result = projectService.generateProjectFromContent(request.getContent());
- return Result.success("项目初始化成功", result);
- } catch (Exception e) {
- log.error("项目初始化失败: {}", e.getMessage(), e);
- return Result.error("项目初始化失败: " + e.getMessage());
- }
- }
-
/**
* 上传文件并生成项目初始化数据
*
diff --git a/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java b/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
index f8e25f8..8c36ef8 100644
--- a/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
+++ b/src/main/java/cn/yinlihupo/service/oss/impl/OssServiceImpl.java
@@ -1,6 +1,7 @@
package cn.yinlihupo.service.oss.impl;
import cn.yinlihupo.common.config.MinioConfig;
+import cn.yinlihupo.common.util.DocumentParserUtil;
import cn.yinlihupo.service.oss.OssService;
import io.minio.*;
import io.minio.errors.*;
@@ -61,22 +62,43 @@ public class OssServiceImpl implements OssService {
@Override
public String readFileAsString(String fileUrl) {
- try (InputStream inputStream = getFileInputStream(fileUrl);
- ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
-
- byte[] buffer = new byte[1024];
- int bytesRead;
- while ((bytesRead = inputStream.read(buffer)) != -1) {
- outputStream.write(buffer, 0, bytesRead);
- }
-
- return outputStream.toString(StandardCharsets.UTF_8);
+ try (InputStream inputStream = getFileInputStream(fileUrl)) {
+ // Take the file name from the URL; the parser uses it as a type hint.
+ String fileName = extractFileNameFromUrl(fileUrl);
+ // Delegate to DocumentParserUtil instead of decoding the raw bytes as UTF-8.
+ return DocumentParserUtil.parse(inputStream, fileName);
} catch (Exception e) {
log.error("读取文件内容失败: {}", e.getMessage(), e);
throw new RuntimeException("读取文件内容失败: " + e.getMessage(), e);
}
}
+ /**
+  * Extracts the file name (last path segment) from a file URL.
+  *
+  * <p>Only the path component is inspected, so a query string is never part
+  * of the result. The segment is returned exactly as it appears in the URL;
+  * no percent-decoding is applied.
+  *
+  * @param fileUrl absolute file URL
+  * @return the text after the last {@code /} of the URL path, or
+  *         {@code "unknown"} if the URL cannot be parsed
+  */
+ private String extractFileNameFromUrl(String fileUrl) {
+     try {
+         String path = new URL(fileUrl).getPath();
+         // lastIndexOf returns -1 when the path has no slash, so the
+         // substring then starts at 0 and yields the whole path; a leading
+         // slash therefore needs no separate handling.
+         return path.substring(path.lastIndexOf('/') + 1);
+     } catch (Exception e) {
+         // Best effort: fall back to a neutral name rather than failing the
+         // read, but keep the cause in the log.
+         log.warn("从 URL 提取文件名失败: {}", fileUrl, e);
+         return "unknown";
+     }
+ }
+
@Override
public InputStream getFileInputStream(String fileUrl) {
try {
diff --git a/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java b/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
index db31d05..e6b48aa 100644
--- a/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
+++ b/src/main/java/cn/yinlihupo/service/project/impl/ProjectServiceImpl.java
@@ -147,8 +147,10 @@ public class ProjectServiceImpl implements ProjectService {
public ProjectInitResult generateProjectFromContent(String content) {
log.info("开始根据内容生成项目初始化数据");
- PromptTemplate promptTemplate = new PromptTemplate(USER_PROMPT_TEMPLATE);
- String userPrompt = promptTemplate.createMessage(java.util.Map.of("content", content)).toString();
+ // 构建用户提示词,直接将内容嵌入
+ String userPrompt = "请根据以下项目资料,生成完整的项目初始化结构化数据:\n\n" +
+ content + "\n\n" +
+ "请严格按照系统提示词中的JSON格式输出,确保所有字段都包含合理的值。";
return chatClient.prompt()
.system(PROJECT_INIT_SYSTEM_PROMPT)
diff --git a/src/main/resources/application-dev.yaml b/src/main/resources/application-dev.yaml
index f396cb1..5e09858 100644
--- a/src/main/resources/application-dev.yaml
+++ b/src/main/resources/application-dev.yaml
@@ -37,12 +37,11 @@ spring:
base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api/v1
chat:
options:
- model: gpt-4o
- temperature: 0.3
+ model: Qwen3
# MinIO 对象存储配置
minio:
- endpoint: 10.200.8.25:9000
+ endpoint: http://10.200.8.25:9000
access-key: minioadmin
secret-key: minioadmin
bucket-name: ylhp-files