feat(project): 实现AI项目初始化及文档解析功能

- 新增DocumentParserUtil工具类,支持PDF、Word、Excel、Markdown及文本解析
- 基于MinIO实现OssService,支持文件上传、下载、删除及URL生成
- 添加ProjectService实现,利用Spring AI ChatClient解析项目文档生成结构化数据
- 新增ProjectController,提供文件上传接口供项目初始化调用
- 配置开发环境application-dev.yaml,包含数据库、MinIO及Spring AI相关配置
- 添加pom.xml,集成必要依赖如Spring AI、MinIO、Apache POI、PDFBox、Tika和Flexmark等组件
This commit is contained in:
2026-03-26 17:59:18 +08:00
parent 4656090683
commit 852cbd60a0
6 changed files with 342 additions and 42 deletions

47
pom.xml
View File

@@ -81,11 +81,50 @@
<version>8.5.7</version>
</dependency>
<!-- aws-java 用于存储桶 -->
<!-- 文档解析依赖 -->
<!-- PDF解析 -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>1.12.700</version>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.30</version>
</dependency>
<!-- Word文档解析 -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.5</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.5</version>
</dependency>
<!-- Excel parsing: no separate schema artifact is declared on purpose.
     poi-ooxml 5.2.5 (declared above) already pulls in the matching OOXML schema classes
     (poi-ooxml-lite) transitively, and .xls support lives in the core poi jar it depends on.
     The legacy poi-ooxml-schemas 4.1.2 artifact belongs to the POI 4.x line; it ships the same
     schema classes built against an older XMLBeans and must not be mixed with 5.x jars. -->
<!-- Markdown解析 -->
<dependency>
<groupId>com.vladsch.flexmark</groupId>
<artifactId>flexmark-all</artifactId>
<version>0.64.8</version>
</dependency>
<!-- Tika 用于通用文档类型检测和内容提取 -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>2.9.1</version>
</dependency>
<dependency>

View File

@@ -0,0 +1,261 @@
package cn.yinlihupo.common.util;
import com.vladsch.flexmark.html.HtmlRenderer;
import com.vladsch.flexmark.parser.Parser;
import com.vladsch.flexmark.util.data.MutableDataSet;
import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.ss.usermodel.*;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
* 文档解析工具类
* 支持 PDF、Word、Excel、Markdown 等格式的文档解析
*/
@Slf4j
public class DocumentParserUtil {
private static final Tika tika = new Tika();
/**
* 自动检测文件类型并解析内容
*
* @param inputStream 文件输入流
* @param fileName 文件名
* @return 文档文本内容
*/
public static String parse(InputStream inputStream, String fileName) {
try {
// 将输入流转换为支持 mark/reset 的 BufferedInputStream
BufferedInputStream bufferedStream = new BufferedInputStream(inputStream);
bufferedStream.mark(Integer.MAX_VALUE);
// 检测文件类型
String mimeType = tika.detect(bufferedStream, fileName);
log.info("检测到文件类型: {}, 文件名: {}", mimeType, fileName);
// 重置输入流到起始位置
bufferedStream.reset();
// 根据文件扩展名或MIME类型选择解析器
String lowerFileName = fileName.toLowerCase();
if (lowerFileName.endsWith(".pdf") || mimeType.equals("application/pdf")) {
return parsePdf(bufferedStream);
} else if (lowerFileName.endsWith(".docx") || mimeType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
return parseDocx(bufferedStream);
} else if (lowerFileName.endsWith(".doc") || mimeType.equals("application/msword")) {
return parseDoc(bufferedStream);
} else if (lowerFileName.endsWith(".xlsx") || mimeType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
return parseExcel(bufferedStream);
} else if (lowerFileName.endsWith(".xls") || mimeType.equals("application/vnd.ms-excel")) {
return parseExcel(bufferedStream);
} else if (lowerFileName.endsWith(".md") || lowerFileName.endsWith(".markdown") || mimeType.equals("text/markdown")) {
return parseMarkdown(bufferedStream);
} else if (lowerFileName.endsWith(".txt") || mimeType.startsWith("text/")) {
return parseText(bufferedStream);
} else {
// 使用 Tika 作为通用解析器
return tika.parseToString(bufferedStream);
}
} catch (Exception e) {
log.error("解析文档失败: {}", fileName, e);
throw new RuntimeException("解析文档失败: " + e.getMessage(), e);
}
}
/**
* 解析 PDF 文件
*
* @param inputStream PDF文件输入流
* @return 文本内容
*/
public static String parsePdf(InputStream inputStream) {
try (PDDocument document = PDDocument.load(inputStream)) {
PDFTextStripper stripper = new PDFTextStripper();
// 设置编码,解决中文乱码问题
stripper.setSortByPosition(true);
String text = stripper.getText(document);
log.info("PDF解析成功共 {} 页", document.getNumberOfPages());
return text;
} catch (Exception e) {
log.error("PDF解析失败", e);
throw new RuntimeException("PDF解析失败: " + e.getMessage(), e);
}
}
/**
* 解析 Word 2007+ 文件 (.docx)
*
* @param inputStream DOCX文件输入流
* @return 文本内容
*/
public static String parseDocx(InputStream inputStream) {
try (XWPFDocument document = new XWPFDocument(inputStream)) {
StringBuilder text = new StringBuilder();
List<XWPFParagraph> paragraphs = document.getParagraphs();
for (XWPFParagraph paragraph : paragraphs) {
text.append(paragraph.getText()).append("\n");
}
log.info("DOCX解析成功共 {} 段落", paragraphs.size());
return text.toString();
} catch (Exception e) {
log.error("DOCX解析失败", e);
throw new RuntimeException("DOCX解析失败: " + e.getMessage(), e);
}
}
/**
* 解析 Word 97-2003 文件 (.doc)
*
* @param inputStream DOC文件输入流
* @return 文本内容
*/
public static String parseDoc(InputStream inputStream) {
try (HWPFDocument document = new HWPFDocument(inputStream);
WordExtractor extractor = new WordExtractor(document)) {
String text = extractor.getText();
log.info("DOC解析成功");
return text;
} catch (Exception e) {
log.error("DOC解析失败", e);
throw new RuntimeException("DOC解析失败: " + e.getMessage(), e);
}
}
/**
* 解析 Excel 文件 (.xlsx, .xls)
*
* @param inputStream Excel文件输入流
* @return 文本内容
*/
public static String parseExcel(InputStream inputStream) {
try (Workbook workbook = WorkbookFactory.create(inputStream)) {
StringBuilder text = new StringBuilder();
int sheetCount = workbook.getNumberOfSheets();
for (int i = 0; i < sheetCount; i++) {
Sheet sheet = workbook.getSheetAt(i);
text.append("Sheet: ").append(sheet.getSheetName()).append("\n");
for (Row row : sheet) {
StringBuilder rowText = new StringBuilder();
for (Cell cell : row) {
String cellValue = getCellValueAsString(cell);
if (!cellValue.isEmpty()) {
rowText.append(cellValue).append("\t");
}
}
if (rowText.length() > 0) {
text.append(rowText.toString().trim()).append("\n");
}
}
text.append("\n");
}
log.info("Excel解析成功共 {} 个Sheet", sheetCount);
return text.toString();
} catch (Exception e) {
log.error("Excel解析失败", e);
throw new RuntimeException("Excel解析失败: " + e.getMessage(), e);
}
}
/**
* 获取单元格的字符串值
*/
private static String getCellValueAsString(Cell cell) {
if (cell == null) {
return "";
}
switch (cell.getCellType()) {
case STRING:
return cell.getStringCellValue();
case NUMERIC:
if (DateUtil.isCellDateFormatted(cell)) {
return cell.getDateCellValue().toString();
}
return String.valueOf(cell.getNumericCellValue());
case BOOLEAN:
return String.valueOf(cell.getBooleanCellValue());
case FORMULA:
return cell.getCellFormula();
default:
return "";
}
}
/**
* 解析 Markdown 文件
*
* @param inputStream Markdown文件输入流
* @return 文本内容
*/
public static String parseMarkdown(InputStream inputStream) {
try {
String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
// 配置 Flexmark
MutableDataSet options = new MutableDataSet();
Parser parser = Parser.builder(options).build();
HtmlRenderer renderer = HtmlRenderer.builder(options).build();
// 解析 Markdown 为 HTML可选如果需要纯文本可以直接返回原始内容
com.vladsch.flexmark.util.ast.Node document = parser.parse(content);
String html = renderer.render(document);
log.info("Markdown解析成功");
// 返回原始 Markdown 文本,如需 HTML 可返回 html 变量
return content;
} catch (Exception e) {
log.error("Markdown解析失败", e);
throw new RuntimeException("Markdown解析失败: " + e.getMessage(), e);
}
}
/**
* 解析纯文本文件
*
* @param inputStream 文本文件输入流
* @return 文本内容
*/
public static String parseText(InputStream inputStream) {
try {
String content = new String(inputStream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
log.info("文本文件解析成功");
return content;
} catch (Exception e) {
log.error("文本文件解析失败", e);
throw new RuntimeException("文本文件解析失败: " + e.getMessage(), e);
}
}
/**
* 检测文件类型
*
* @param inputStream 文件输入流
* @param fileName 文件名
* @return MIME类型
*/
public static String detectMimeType(InputStream inputStream, String fileName) {
try {
return tika.detect(inputStream, fileName);
} catch (Exception e) {
log.error("检测文件类型失败", e);
return "application/octet-stream";
}
}
}

View File

@@ -23,29 +23,6 @@ public class ProjectController {
private final ProjectService projectService;
private final OssService ossService;
/**
 * Builds project-initialisation data from raw text supplied in the request body.
 *
 * @param request request carrying the project material as text
 * @return the structured initialisation result, or an error result when the content is
 *         missing/blank or generation fails
 */
@PostMapping("/from-content")
public Result<ProjectInitResult> generateFromContent(@RequestBody ProjectInitRequest request) {
    log.info("收到项目初始化请求(文本内容)");

    // Guard clause: reject missing or whitespace-only content up front.
    String material = request.getContent();
    boolean hasMaterial = material != null && !material.trim().isEmpty();
    if (!hasMaterial) {
        return Result.error("项目资料内容不能为空");
    }

    try {
        ProjectInitResult generated = projectService.generateProjectFromContent(material);
        return Result.success("项目初始化成功", generated);
    } catch (Exception ex) {
        String reason = ex.getMessage();
        log.error("项目初始化失败: {}", reason, ex);
        return Result.error("项目初始化失败: " + reason);
    }
}
/**
* 上传文件并生成项目初始化数据
*

View File

@@ -1,6 +1,7 @@
package cn.yinlihupo.service.oss.impl;
import cn.yinlihupo.common.config.MinioConfig;
import cn.yinlihupo.common.util.DocumentParserUtil;
import cn.yinlihupo.service.oss.OssService;
import io.minio.*;
import io.minio.errors.*;
@@ -61,22 +62,43 @@ public class OssServiceImpl implements OssService {
@Override
public String readFileAsString(String fileUrl) {
try (InputStream inputStream = getFileInputStream(fileUrl);
ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = inputStream.read(buffer)) != -1) {
outputStream.write(buffer, 0, bytesRead);
}
return outputStream.toString(StandardCharsets.UTF_8);
try (InputStream inputStream = getFileInputStream(fileUrl)) {
// 从 URL 中提取文件名
String fileName = extractFileNameFromUrl(fileUrl);
// 使用文档解析工具类解析文件内容
return DocumentParserUtil.parse(inputStream, fileName);
} catch (Exception e) {
log.error("读取文件内容失败: {}", e.getMessage(), e);
throw new RuntimeException("读取文件内容失败: " + e.getMessage(), e);
}
}
/**
 * Derives the file name (last path segment) from a file URL.
 *
 * <p>The query string and fragment are ignored and percent-encoded characters in the name
 * are decoded. Input that is not a syntactically valid URI (for example a URL containing
 * raw spaces) is still handled by cutting the string at the first {@code ?} or {@code #}
 * and taking the text after the last {@code /}. Relative paths without a protocol are
 * accepted as well, so their extension is not lost.
 *
 * @param fileUrl file URL or path; may be {@code null}
 * @return the file name, or {@code "unknown"} when no name can be determined
 */
private String extractFileNameFromUrl(String fileUrl) {
    if (fileUrl == null || fileUrl.isBlank()) {
        log.warn("从 URL 提取文件名失败: {}", fileUrl);
        return "unknown";
    }

    String candidate = fileUrl.trim();
    String path;
    try {
        // URI#getPath returns the decoded path and excludes query and fragment.
        path = new java.net.URI(candidate).getPath();
    } catch (java.net.URISyntaxException ignored) {
        // Not a strict URI (e.g. unescaped spaces); fall back to plain string handling below.
        path = null;
    }

    if (path == null) {
        int end = candidate.length();
        int queryStart = candidate.indexOf('?');
        if (queryStart >= 0) {
            end = queryStart;
        }
        int fragmentStart = candidate.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < end) {
            end = fragmentStart;
        }
        path = candidate.substring(0, end);
    }

    // lastIndexOf returns -1 when there is no slash, so "+ 1" keeps the whole string.
    String fileName = path.substring(path.lastIndexOf('/') + 1);
    if (fileName.isEmpty()) {
        log.warn("从 URL 提取文件名失败: {}", fileUrl);
        return "unknown";
    }
    return fileName;
}
@Override
public InputStream getFileInputStream(String fileUrl) {
try {

View File

@@ -147,8 +147,10 @@ public class ProjectServiceImpl implements ProjectService {
public ProjectInitResult generateProjectFromContent(String content) {
log.info("开始根据内容生成项目初始化数据");
PromptTemplate promptTemplate = new PromptTemplate(USER_PROMPT_TEMPLATE);
String userPrompt = promptTemplate.createMessage(java.util.Map.of("content", content)).toString();
// 构建用户提示词,直接将内容嵌入
String userPrompt = "请根据以下项目资料,生成完整的项目初始化结构化数据:\n\n" +
content + "\n\n" +
"请严格按照系统提示词中的JSON格式输出确保所有字段都包含合理的值。";
return chatClient.prompt()
.system(PROJECT_INIT_SYSTEM_PROMPT)

View File

@@ -37,12 +37,11 @@ spring:
base-url: https://sg1.proxy.yinlihupo.cc/proxy/https://openrouter.ai/api/v1
chat:
options:
model: gpt-4o
temperature: 0.3
model: Qwen3
# MinIO 对象存储配置
minio:
endpoint: 10.200.8.25:9000
endpoint: http://10.200.8.25:9000
access-key: minioadmin
secret-key: minioadmin
bucket-name: ylhp-files