feat(project): 实现AI项目初始化及文档解析功能
- 新增DocumentParserUtil工具类,支持PDF、Word、Excel、Markdown及文本解析
- 基于MinIO实现OssService,支持文件上传、下载、删除及URL生成
- 添加ProjectService实现,利用Spring AI ChatClient解析项目文档生成结构化数据
- 新增ProjectController,提供文件上传接口供项目初始化调用
- 配置开发环境application-dev.yaml,包含数据库、MinIO及Spring AI相关配置
- 添加pom.xml,集成必要依赖如Spring AI、MinIO、Apache POI、PDFBox、Tika和Flexmark等组件
This commit is contained in:
47
pom.xml
47
pom.xml
@@ -81,11 +81,50 @@
|
||||
<version>8.5.7</version>
|
||||
</dependency>
|
||||
|
||||
<!-- aws-java 用于存储桶 -->
|
||||
<!-- 文档解析依赖 -->
|
||||
<!-- PDF解析 -->
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>aws-java-sdk-s3</artifactId>
|
||||
<version>1.12.700</version>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>2.0.30</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Word文档解析 -->
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-ooxml</artifactId>
|
||||
<version>5.2.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.poi</groupId>
|
||||
<artifactId>poi-scratchpad</artifactId>
|
||||
<version>5.2.5</version>
|
||||
</dependency>
|
||||
|
||||
<!--
  Excel parsing: no separate schema artifact is declared here on purpose.
  poi-ooxml 5.2.5 (declared above) already pulls in its matching schema classes
  (poi-ooxml-lite) transitively. The legacy poi-ooxml-schemas 4.1.2 artifact belongs to
  the POI 4.x line; declaring it next to POI 5.x puts a second, incompatible copy of the
  org.openxmlformats schema classes on the classpath, and mixing POI versions is not
  supported.
-->
|
||||
|
||||
<!-- Markdown解析 -->
|
||||
<dependency>
|
||||
<groupId>com.vladsch.flexmark</groupId>
|
||||
<artifactId>flexmark-all</artifactId>
|
||||
<version>0.64.8</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Tika 用于通用文档类型检测和内容提取 -->
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-core</artifactId>
|
||||
<version>2.9.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-parsers-standard-package</artifactId>
|
||||
<version>2.9.1</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
||||
261
src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java
Normal file
261
src/main/java/cn/yinlihupo/common/util/DocumentParserUtil.java
Normal file
@@ -0,0 +1,261 @@
|
||||
package cn.yinlihupo.common.util;
|
||||
|
||||
import com.vladsch.flexmark.html.HtmlRenderer;
|
||||
import com.vladsch.flexmark.parser.Parser;
|
||||
import com.vladsch.flexmark.util.data.MutableDataSet;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.poi.hwpf.HWPFDocument;
|
||||
import org.apache.poi.hwpf.extractor.WordExtractor;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
||||
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
||||
import org.apache.poi.ss.usermodel.*;
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 文档解析工具类
|
||||
* 支持 PDF、Word、Excel、Markdown 等格式的文档解析
|
||||
*/
|
||||
@Slf4j
|
||||
public class DocumentParserUtil {
|
||||
|
||||
private static final Tika tika = new Tika();
|
||||
|
||||
/**
|
||||
* 自动检测文件类型并解析内容
|
||||
*
|
||||
* @param inputStream 文件输入流
|
||||
* @param fileName 文件名
|
||||
* @return 文档文本内容
|
||||
*/
|
||||
public static String parse(InputStream inputStream, String fileName) {
|
||||
try {
|
||||
// 将输入流转换为支持 mark/reset 的 BufferedInputStream
|
||||
BufferedInputStream bufferedStream = new BufferedInputStream(inputStream);
|
||||
bufferedStream.mark(Integer.MAX_VALUE);
|
||||
|
||||
// 检测文件类型
|
||||
String mimeType = tika.detect(bufferedStream, fileName);
|
||||
log.info("检测到文件类型: {}, 文件名: {}", mimeType, fileName);
|
||||
|
||||
// 重置输入流到起始位置
|
||||
bufferedStream.reset();
|
||||
|
||||
// 根据文件扩展名或MIME类型选择解析器
|
||||
String lowerFileName = fileName.toLowerCase();
|
||||
|
||||
if (lowerFileName.endsWith(".pdf") || mimeType.equals("application/pdf")) {
|
||||
return parsePdf(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".docx") || mimeType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
|
||||
return parseDocx(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".doc") || mimeType.equals("application/msword")) {
|
||||
return parseDoc(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".xlsx") || mimeType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
|
||||
return parseExcel(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".xls") || mimeType.equals("application/vnd.ms-excel")) {
|
||||
return parseExcel(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".md") || lowerFileName.endsWith(".markdown") || mimeType.equals("text/markdown")) {
|
||||
return parseMarkdown(bufferedStream);
|
||||
} else if (lowerFileName.endsWith(".txt") || mimeType.startsWith("text/")) {
|
||||
return parseText(bufferedStream);
|
||||
} else {
|
||||
// 使用 Tika 作为通用解析器
|
||||
return tika.parseToString(bufferedStream);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("解析文档失败: {}", fileName, e);
|
||||
throw new RuntimeException("解析文档失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 PDF 文件
|
||||
*
|
||||
* @param inputStream PDF文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parsePdf(InputStream inputStream) {
|
||||
try (PDDocument document = PDDocument.load(inputStream)) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
// 设置编码,解决中文乱码问题
|
||||
stripper.setSortByPosition(true);
|
||||
String text = stripper.getText(document);
|
||||
log.info("PDF解析成功,共 {} 页", document.getNumberOfPages());
|
||||
return text;
|
||||
} catch (Exception e) {
|
||||
log.error("PDF解析失败", e);
|
||||
throw new RuntimeException("PDF解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 Word 2007+ 文件 (.docx)
|
||||
*
|
||||
* @param inputStream DOCX文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parseDocx(InputStream inputStream) {
|
||||
try (XWPFDocument document = new XWPFDocument(inputStream)) {
|
||||
StringBuilder text = new StringBuilder();
|
||||
List<XWPFParagraph> paragraphs = document.getParagraphs();
|
||||
for (XWPFParagraph paragraph : paragraphs) {
|
||||
text.append(paragraph.getText()).append("\n");
|
||||
}
|
||||
log.info("DOCX解析成功,共 {} 段落", paragraphs.size());
|
||||
return text.toString();
|
||||
} catch (Exception e) {
|
||||
log.error("DOCX解析失败", e);
|
||||
throw new RuntimeException("DOCX解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 Word 97-2003 文件 (.doc)
|
||||
*
|
||||
* @param inputStream DOC文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parseDoc(InputStream inputStream) {
|
||||
try (HWPFDocument document = new HWPFDocument(inputStream);
|
||||
WordExtractor extractor = new WordExtractor(document)) {
|
||||
String text = extractor.getText();
|
||||
log.info("DOC解析成功");
|
||||
return text;
|
||||
} catch (Exception e) {
|
||||
log.error("DOC解析失败", e);
|
||||
throw new RuntimeException("DOC解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 Excel 文件 (.xlsx, .xls)
|
||||
*
|
||||
* @param inputStream Excel文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parseExcel(InputStream inputStream) {
|
||||
try (Workbook workbook = WorkbookFactory.create(inputStream)) {
|
||||
StringBuilder text = new StringBuilder();
|
||||
int sheetCount = workbook.getNumberOfSheets();
|
||||
|
||||
for (int i = 0; i < sheetCount; i++) {
|
||||
Sheet sheet = workbook.getSheetAt(i);
|
||||
text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
|
||||
|
||||
for (Row row : sheet) {
|
||||
StringBuilder rowText = new StringBuilder();
|
||||
for (Cell cell : row) {
|
||||
String cellValue = getCellValueAsString(cell);
|
||||
if (!cellValue.isEmpty()) {
|
||||
rowText.append(cellValue).append("\t");
|
||||
}
|
||||
}
|
||||
if (rowText.length() > 0) {
|
||||
text.append(rowText.toString().trim()).append("\n");
|
||||
}
|
||||
}
|
||||
text.append("\n");
|
||||
}
|
||||
|
||||
log.info("Excel解析成功,共 {} 个Sheet", sheetCount);
|
||||
return text.toString();
|
||||
} catch (Exception e) {
|
||||
log.error("Excel解析失败", e);
|
||||
throw new RuntimeException("Excel解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取单元格的字符串值
|
||||
*/
|
||||
private static String getCellValueAsString(Cell cell) {
|
||||
if (cell == null) {
|
||||
return "";
|
||||
}
|
||||
|
||||
switch (cell.getCellType()) {
|
||||
case STRING:
|
||||
return cell.getStringCellValue();
|
||||
case NUMERIC:
|
||||
if (DateUtil.isCellDateFormatted(cell)) {
|
||||
return cell.getDateCellValue().toString();
|
||||
}
|
||||
return String.valueOf(cell.getNumericCellValue());
|
||||
case BOOLEAN:
|
||||
return String.valueOf(cell.getBooleanCellValue());
|
||||
case FORMULA:
|
||||
return cell.getCellFormula();
|
||||
default:
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析 Markdown 文件
|
||||
*
|
||||
* @param inputStream Markdown文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parseMarkdown(InputStream inputStream) {
|
||||
try {
|
||||
String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
|
||||
|
||||
// 配置 Flexmark
|
||||
MutableDataSet options = new MutableDataSet();
|
||||
Parser parser = Parser.builder(options).build();
|
||||
HtmlRenderer renderer = HtmlRenderer.builder(options).build();
|
||||
|
||||
// 解析 Markdown 为 HTML(可选,如果需要纯文本可以直接返回原始内容)
|
||||
com.vladsch.flexmark.util.ast.Node document = parser.parse(content);
|
||||
String html = renderer.render(document);
|
||||
|
||||
log.info("Markdown解析成功");
|
||||
// 返回原始 Markdown 文本,如需 HTML 可返回 html 变量
|
||||
return content;
|
||||
} catch (Exception e) {
|
||||
log.error("Markdown解析失败", e);
|
||||
throw new RuntimeException("Markdown解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 解析纯文本文件
|
||||
*
|
||||
* @param inputStream 文本文件输入流
|
||||
* @return 文本内容
|
||||
*/
|
||||
public static String parseText(InputStream inputStream) {
|
||||
try {
|
||||
String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
|
||||
log.info("文本文件解析成功");
|
||||
return content;
|
||||
} catch (Exception e) {
|
||||
log.error("文本文件解析失败", e);
|
||||
throw new RuntimeException("文本文件解析失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 检测文件类型
|
||||
*
|
||||
* @param inputStream 文件输入流
|
||||
* @param fileName 文件名
|
||||
* @return MIME类型
|
||||
*/
|
||||
public static String detectMimeType(InputStream inputStream, String fileName) {
|
||||
try {
|
||||
return tika.detect(inputStream, fileName);
|
||||
} catch (Exception e) {
|
||||
log.error("检测文件类型失败", e);
|
||||
return "application/octet-stream";
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -23,29 +23,6 @@ public class ProjectController {
|
||||
private final ProjectService projectService;
|
||||
private final OssService ossService;
|
||||
|
||||
/**
|
||||
* 根据文本内容生成项目初始化数据
|
||||
*
|
||||
* @param request 包含项目资料内容的请求
|
||||
* @return 项目初始化结构化数据
|
||||
*/
|
||||
@PostMapping("/from-content")
|
||||
public Result<ProjectInitResult> generateFromContent(@RequestBody ProjectInitRequest request) {
|
||||
log.info("收到项目初始化请求(文本内容)");
|
||||
|
||||
if (request.getContent() == null || request.getContent().trim().isEmpty()) {
|
||||
return Result.error("项目资料内容不能为空");
|
||||
}
|
||||
|
||||
try {
|
||||
ProjectInitResult result = projectService.generateProjectFromContent(request.getContent());
|
||||
return Result.success("项目初始化成功", result);
|
||||
} catch (Exception e) {
|
||||
log.error("项目初始化失败: {}", e.getMessage(), e);
|
||||
return Result.error("项目初始化失败: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 上传文件并生成项目初始化数据
|
||||
*
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package cn.yinlihupo.service.oss.impl;
|
||||
|
||||
import cn.yinlihupo.common.config.MinioConfig;
|
||||
import cn.yinlihupo.common.util.DocumentParserUtil;
|
||||
import cn.yinlihupo.service.oss.OssService;
|
||||
import io.minio.*;
|
||||
import io.minio.errors.*;
|
||||
@@ -61,22 +62,43 @@ public class OssServiceImpl implements OssService {
|
||||
|
||||
@Override
|
||||
public String readFileAsString(String fileUrl) {
|
||||
try (InputStream inputStream = getFileInputStream(fileUrl);
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
|
||||
|
||||
byte[] buffer = new byte[1024];
|
||||
int bytesRead;
|
||||
while ((bytesRead = inputStream.read(buffer)) != -1) {
|
||||
outputStream.write(buffer, 0, bytesRead);
|
||||
}
|
||||
|
||||
return outputStream.toString(StandardCharsets.UTF_8);
|
||||
try (InputStream inputStream = getFileInputStream(fileUrl)) {
|
||||
// 从 URL 中提取文件名
|
||||
String fileName = extractFileNameFromUrl(fileUrl);
|
||||
// 使用文档解析工具类解析文件内容
|
||||
return DocumentParserUtil.parse(inputStream, fileName);
|
||||
} catch (Exception e) {
|
||||
log.error("读取文件内容失败: {}", e.getMessage(), e);
|
||||
throw new RuntimeException("读取文件内容失败: " + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 从 URL 中提取文件名
|
||||
*
|
||||
* @param fileUrl 文件 URL
|
||||
* @return 文件名
|
||||
*/
|
||||
private String extractFileNameFromUrl(String fileUrl) {
|
||||
try {
|
||||
URL url = new URL(fileUrl);
|
||||
String path = url.getPath();
|
||||
// 去掉开头的 /
|
||||
if (path.startsWith("/")) {
|
||||
path = path.substring(1);
|
||||
}
|
||||
// 获取最后一个 / 后面的文件名
|
||||
int lastSlashIndex = path.lastIndexOf('/');
|
||||
if (lastSlashIndex >= 0) {
|
||||
return path.substring(lastSlashIndex + 1);
|
||||
}
|
||||
return path;
|
||||
} catch (Exception e) {
|
||||
log.warn("从 URL 提取文件名失败: {}", fileUrl);
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getFileInputStream(String fileUrl) {
|
||||
try {
|
||||
|
||||
@@ -147,8 +147,10 @@ public class ProjectServiceImpl implements ProjectService {
|
||||
public ProjectInitResult generateProjectFromContent(String content) {
|
||||
log.info("开始根据内容生成项目初始化数据");
|
||||
|
||||
PromptTemplate promptTemplate = new PromptTemplate(USER_PROMPT_TEMPLATE);
|
||||
String userPrompt = promptTemplate.createMessage(java.util.Map.of("content", content)).toString();
|
||||
// 构建用户提示词,直接将内容嵌入
|
||||
String userPrompt = "请根据以下项目资料,生成完整的项目初始化结构化数据:\n\n" +
|
||||
content + "\n\n" +
|
||||
"请严格按照系统提示词中的JSON格式输出,确保所有字段都包含合理的值。";
|
||||
|
||||
return chatClient.prompt()
|
||||
.system(PROJECT_INIT_SYSTEM_PROMPT)
|
||||
|
||||
@@ -37,12 +37,11 @@ spring:
|
||||
base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api/v1
|
||||
chat:
|
||||
options:
|
||||
model: gpt-4o
|
||||
temperature: 0.3
|
||||
model: Qwen3
|
||||
|
||||
# MinIO 对象存储配置
|
||||
minio:
|
||||
endpoint: 10.200.8.25:9000
|
||||
endpoint: http://10.200.8.25:9000
|
||||
access-key: minioadmin
|
||||
secret-key: minioadmin
|
||||
bucket-name: ylhp-files
|
||||
|
||||
Reference in New Issue
Block a user