package tech.aiflowy.ai.controller;
|
|
import com.agentsflex.core.document.DocumentSplitter;
|
import com.agentsflex.core.document.splitter.RegexDocumentSplitter;
|
import com.agentsflex.core.document.splitter.SimpleTokenizeSplitter;
|
import com.agentsflex.core.llm.embedding.EmbeddingOptions;
|
import org.springframework.core.io.ClassPathResource;
|
import tech.aiflowy.ai.entity.AiDocument;
|
import tech.aiflowy.ai.entity.AiDocumentChunk;
|
import tech.aiflowy.ai.entity.AiKnowledge;
|
import tech.aiflowy.ai.entity.AiLlm;
|
import tech.aiflowy.ai.service.*;
|
import tech.aiflowy.ai.service.impl.AiDocumentServiceImpl;
|
import tech.aiflowy.common.ai.DocumentParserFactory;
|
import tech.aiflowy.common.ai.ExcelDocumentSplitter;
|
import tech.aiflowy.common.domain.Result;
|
import tech.aiflowy.common.tree.Tree;
|
import tech.aiflowy.common.util.RequestUtil;
|
import tech.aiflowy.common.util.StringUtil;
|
import tech.aiflowy.common.web.controller.BaseCurdController;
|
import tech.aiflowy.common.web.jsonbody.JsonBody;
|
import tech.aiflowy.core.utils.JudgeFileTypeUtil;
|
import tech.aiflowy.common.filestorage.FileStorageService;
|
import cn.dev33.satoken.stp.StpUtil;
|
import com.agentsflex.core.document.Document;
|
import com.agentsflex.core.document.DocumentParser;
|
import com.agentsflex.core.document.splitter.SimpleDocumentSplitter;
|
import com.agentsflex.core.llm.Llm;
|
import com.agentsflex.core.store.DocumentStore;
|
import com.agentsflex.core.store.StoreOptions;
|
import com.agentsflex.core.store.StoreResult;
|
import com.mybatisflex.core.paginate.Page;
|
import com.mybatisflex.core.query.QueryWrapper;
|
import org.apache.commons.lang.StringUtils;
|
import org.slf4j.LoggerFactory;
|
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.http.MediaType;
|
import org.springframework.transaction.annotation.Transactional;
|
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.multipart.MultipartFile;
|
|
import javax.annotation.Resource;
|
import java.io.File;
|
import java.io.IOException;
|
import java.io.InputStream;
|
import java.io.Serializable;
|
import java.math.BigInteger;
|
import java.util.*;
|
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.stream.Collectors;
|
|
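/**
 * REST controller for knowledge-base documents: listing, upload (with optional split preview),
 * ordering, preview and removal.
 *
 * @author michael
 * @since 2024-08-23
 */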
|
@RestController
|
@RequestMapping("/api/v1/aiDocument")
|
public class AiDocumentController extends BaseCurdController<AiDocumentService, AiDocument> {
|
|
/** Knowledge-base lookups (by id or slug). */
private final AiKnowledgeService knowledgeService;

/** Persistence of the chunks a document is split into. */
private final AiDocumentChunkService documentChunkService;

/** Lookup of the model configuration referenced by a knowledge base. */
private final AiLlmService aiLlmService;

/** Document service, injected by field in addition to the {@code service} handed to the base class. */
@Autowired
private AiDocumentService aiDocumentService;

/** File storage bean registered under the name {@code default}. */
@Resource(name = "default")
FileStorageService storageService;

/** Value of the {@code aiflowy.storage.local.root} property; read by {@link #getRootPath()}. */
@Value("${aiflowy.storage.local.root}")
private String fileUploadPath;

/**
 * Creates the controller with its constructor-injected collaborators.
 *
 * @param service              document service passed on to the base controller
 * @param knowledgeService     knowledge-base service
 * @param documentChunkService document-chunk service
 * @param aiLlmService         model-configuration service
 */
public AiDocumentController(
        AiDocumentService service,
        AiKnowledgeService knowledgeService,
        AiDocumentChunkService documentChunkService,
        AiLlmService aiLlmService
) {
    super(service);
    this.aiLlmService = aiLlmService;
    this.documentChunkService = documentChunkService;
    this.knowledgeService = knowledgeService;
}
|
@PostMapping("removeDoc")
|
@Transactional
|
public Result remove(@JsonBody(value = "id", required = true) String id) {
|
List<Serializable> ids = Collections.singletonList(id);
|
Result result = onRemoveBefore(ids);
|
if (result != null) return result;
|
boolean isSuccess = aiDocumentService.removeDoc(id);
|
if (!isSuccess){
|
return Result.fail(1,"删除失败");
|
}
|
boolean success = service.removeById(id);
|
onRemoveAfter(ids);
|
return Result.create(success);
|
}
|
|
/**
|
*
|
* @param documentId 文档id
|
* @return
|
* @throws IOException
|
*/
|
@PostMapping("docPreview")
|
public Result previewFile(@JsonBody(value = "documentId", required = true) String documentId) throws IOException {
|
|
return Result.success(aiDocumentService.previewFile(documentId));
|
}
|
|
|
/**
|
* 查询所有所有数据
|
*
|
* @param entity
|
* @param asTree
|
* @param sortKey
|
* @param sortType
|
* @return 所有数据
|
*/
|
@GetMapping("list")
|
@Override
|
public Result list(AiDocument entity, Boolean asTree, String sortKey, String sortType) {
|
String kbSlug = RequestUtil.getParamAsString("id");
|
if (StringUtil.noText(kbSlug)) {
|
return Result.fail(1, "知识库id不能为空");
|
}
|
|
AiKnowledge knowledge = StringUtil.isNumeric(kbSlug)
|
? knowledgeService.getById(kbSlug)
|
: knowledgeService.getOne(QueryWrapper.create().eq(AiKnowledge::getSlug, kbSlug));
|
|
if (knowledge == null) {
|
return Result.fail(2, "知识库不存在");
|
}
|
|
QueryWrapper queryWrapper = QueryWrapper.create()
|
.eq(AiDocument::getKnowledgeId, knowledge.getId());
|
queryWrapper.orderBy(buildOrderBy(sortKey, sortType, getDefaultOrderBy()));
|
List<AiDocument> aiDocuments = service.list(queryWrapper);
|
List<AiDocument> list = Tree.tryToTree(aiDocuments, asTree);
|
return Result.success(list);
|
}
|
|
@GetMapping("documentList")
|
public Result documentList(@RequestParam(name="fileName", required = false) String fileName, @RequestParam(name="pageSize") int pageSize, @RequestParam(name = "current") int current) {
|
String kbSlug = RequestUtil.getParamAsString("id");
|
if (StringUtil.noText(kbSlug)) {
|
return Result.fail(1, "知识库id不能为空");
|
}
|
Page<AiDocument> documentList = aiDocumentService.getDocumentList(kbSlug, pageSize, current,fileName);
|
return Result.success(documentList);
|
}
|
|
|
@Override
|
protected String getDefaultOrderBy() {
|
return "order_no asc";
|
}
|
|
|
@PostMapping("update")
|
@Override
|
public Result update(@JsonBody AiDocument entity) {
|
super.update(entity);
|
return updatePosition(entity);
|
}
|
|
/**
|
*
|
* @param file 上传的文件
|
* @param knowledgeId 知识库id
|
* @param chunkSize 分段大小
|
* @param overlapSize 分段重叠长度
|
* @param userWillSave 用户的操作是否要保存当前上传的文件 true 保存 false 不保存, 用户只预览上传文件后分割的效果
|
* @return
|
* @throws IOException
|
*/
|
@Transactional
|
@PostMapping(value = "upload", produces = MediaType.APPLICATION_JSON_VALUE)
|
public Result upload(@RequestParam("file") MultipartFile file, @RequestParam("knowledgeId") BigInteger knowledgeId,
|
@RequestParam(name="splitterName", required = false) String splitterName,
|
@RequestParam(name="chunkSize", required = false) Integer chunkSize,
|
@RequestParam(name="overlapSize", required = false) Integer overlapSize,
|
@RequestParam(name="regex", required = false) String regex,
|
@RequestParam(name="rowsPerChunk", required = false) Integer rowsPerChunk,
|
@RequestParam(name="userWillSave") boolean userWillSave
|
) throws IOException {
|
if (chunkSize == null){
|
chunkSize = 100;
|
}
|
if (overlapSize == null){
|
overlapSize = 200;
|
}
|
if (rowsPerChunk == null){
|
rowsPerChunk = 1;
|
}
|
if (file.getOriginalFilename() == null){
|
return Result.fail(1,"文件名不能为空");
|
}
|
String fileTypeByExtension = JudgeFileTypeUtil.getFileTypeByExtension(file.getOriginalFilename());
|
if (StringUtils.isEmpty(fileTypeByExtension)){
|
return Result.fail(2,"不支持的文档类型");
|
}
|
DocumentParser documentParser = DocumentParserFactory.getDocumentParser(file.getOriginalFilename());
|
if (documentParser == null) {
|
return Result.fail(3, "can not support the file type: " + file.getOriginalFilename());
|
}
|
String path = storageService.save(file);
|
AiDocument aiDocument = new AiDocument();
|
try (InputStream stream = storageService.readStream(path);) {
|
Document document = documentParser.parse(stream);
|
aiDocument.setContent(document.getContent());
|
} catch (IOException e) {
|
throw new RuntimeException(e);
|
}
|
|
//如果用户是预览分割效果
|
if (!userWillSave){
|
List<AiDocumentChunk> previewList = new ArrayList<>();
|
// 设置分割器 todo 未来可以通过参数来指定分割器,不同的文档使用不同的分割器效果更好
|
DocumentSplitter documentSplitter = getDocumentSplitter(splitterName, chunkSize, overlapSize, regex, rowsPerChunk);
|
Document document = Document.of(aiDocument.getContent());
|
List<Document> documents = documentSplitter.split(document);
|
int sort = 1;
|
for (Document value : documents) {
|
AiDocumentChunk chunk = new AiDocumentChunk();
|
chunk.setContent(value.getContent());
|
chunk.setSorting(sort);
|
sort++;
|
previewList.add(chunk);
|
}
|
// 删除本地文件
|
AiDocumentServiceImpl.deleteFile(getRootPath() + path);
|
Map<String, Object> res = new HashMap<>();
|
res.put("data", previewList);
|
res.put("userWillSave", false);
|
// 返回分割效果给用户
|
return Result.success(res);
|
}
|
|
|
aiDocument.setDocumentType(fileTypeByExtension);
|
aiDocument.setKnowledgeId(knowledgeId);
|
aiDocument.setDocumentPath(path);
|
aiDocument.setCreated(new Date());
|
aiDocument.setModifiedBy(BigInteger.valueOf(StpUtil.getLoginIdAsLong()));
|
aiDocument.setModified(new Date());
|
|
if (chunkSize != null && chunkSize != 0){
|
aiDocument.setChunkSize(chunkSize);
|
} else {
|
aiDocument.setChunkSize(200);
|
}
|
if (overlapSize != null && overlapSize != 0){
|
aiDocument.setOverlapSize(overlapSize);
|
} else {
|
aiDocument.setOverlapSize(100);
|
}
|
aiDocument.setTitle(StringUtil.removeFileExtension(file.getOriginalFilename()));
|
|
super.save(aiDocument);
|
return storeDocument(aiDocument, splitterName, chunkSize, overlapSize, regex, rowsPerChunk);
|
}
|
|
|
/**
|
* 更新 entity
|
*
|
* @param entity
|
* @return Result
|
*/
|
private Result updatePosition(AiDocument entity) {
|
Integer orderNo = entity.getOrderNo();
|
if (orderNo != null) {
|
if (orderNo <= 0) orderNo = 0;
|
BigInteger knowledgeId = service.getById(entity.getId()).getKnowledgeId();
|
List<AiDocument> list = service.list(QueryWrapper.create()
|
.eq(AiDocument::getKnowledgeId, knowledgeId)
|
.orderBy(getDefaultOrderBy())
|
);
|
|
list.removeIf(item -> item.getId().equals(entity.getId()));
|
if (orderNo >= list.size()) {
|
list.add(entity);
|
} else {
|
list.add(orderNo, entity);
|
}
|
|
List<AiDocument> updateList = new ArrayList<>();
|
for (int i = 0; i < list.size(); i++) {
|
AiDocument updateItem = new AiDocument();
|
updateItem.setId(list.get(i).getId());
|
updateItem.setOrderNo(i);
|
updateList.add(updateItem);
|
}
|
|
service.updateBatch(updateList);
|
}
|
|
return Result.success();
|
}
|
|
/**
|
* 文档存储到向量数据库
|
*
|
* @param entity 将要分割的文档
|
* @param splitterName 分割器名称
|
* @param chunkSize 分割器名称
|
* @param overlapSize 分段大小
|
* @param overlapSize 分段重叠大小
|
* @param regex 正则表达式
|
*/
|
protected Result storeDocument(AiDocument entity, String splitterName, int chunkSize, int overlapSize, String regex, Integer rowsPerChunk) {
|
entity = service.getById(entity.getId());
|
AiKnowledge knowledge = knowledgeService.getById(entity.getKnowledgeId());
|
if (knowledge == null) {
|
return Result.fail(1, "知识库不存在");
|
}
|
DocumentStore documentStore = knowledge.toDocumentStore();
|
if (documentStore == null){
|
return Result.fail(2, "向量数据库类型未设置");
|
}
|
// 设置向量模型
|
AiLlm aiLlm = aiLlmService.getById(knowledge.getVectorEmbedLlmId());
|
if (aiLlm == null) {
|
return Result.fail(3, "该知识库未配置大模型");
|
|
}
|
// 设置向量模型
|
Llm embeddingModel = aiLlm.toLlm();
|
documentStore.setEmbeddingModel(embeddingModel);
|
|
StoreOptions options = StoreOptions.ofCollectionName(knowledge.getVectorStoreCollection());
|
EmbeddingOptions embeddingOptions = new EmbeddingOptions();
|
embeddingOptions.setModel(aiLlm.getLlmModel());
|
options.setEmbeddingOptions(embeddingOptions);
|
if (entity.getId() != null) {
|
List<AiDocumentChunk> documentChunks = documentChunkService.list(QueryWrapper.create()
|
.eq(AiDocumentChunk::getDocumentId, entity.getId()));
|
|
if (documentChunks != null && !documentChunks.isEmpty()) {
|
List<BigInteger> chunkIds = documentChunks.stream()
|
.map(AiDocumentChunk::getId)
|
.collect(Collectors.toList());
|
|
//移除所有的文档分段内容
|
documentChunkService.removeByIds(chunkIds);
|
|
//移除向量数据库的所有内容
|
documentStore.delete(chunkIds);
|
}
|
}
|
|
// 设置分割器 todo 未来可以通过参数来指定分割器,不同的文档使用不同的分割器效果更好
|
documentStore.setDocumentSplitter(getDocumentSplitter(splitterName, chunkSize, overlapSize, regex, rowsPerChunk));
|
|
AiDocument finalEntity = entity;
|
AtomicInteger sort = new AtomicInteger(1);
|
// 设置文档ID生成器
|
documentStore.setDocumentIdGenerator(document -> {
|
AiDocumentChunk chunk = new AiDocumentChunk();
|
chunk.setContent(document.getContent());
|
chunk.setDocumentId(finalEntity.getId());
|
chunk.setKnowledgeId(finalEntity.getKnowledgeId());
|
chunk.setSorting(sort.get());
|
boolean success = documentChunkService.save(chunk);
|
sort.getAndIncrement();
|
|
if (success) {
|
return chunk.getId();
|
} else {
|
throw new IllegalStateException("Can not save document chunk");
|
}
|
});
|
|
Document document = Document.of(entity.getContent());
|
|
StoreResult result = documentStore.store(document, options);
|
if (!result.isSuccess()) {
|
LoggerFactory.getLogger(AiDocumentController.class).error("DocumentStore.store failed: " + result);
|
}
|
AiKnowledge aiKnowledge = new AiKnowledge();
|
aiKnowledge.setId(entity.getKnowledgeId());
|
// CanUpdateEmbedLlm false: 不能修改知识库的大模型 true: 可以修改
|
aiKnowledge.setCanUpdateEmbedding(false);
|
knowledgeService.updateById(aiKnowledge);
|
return Result.success();
|
}
|
|
public String getRootPath() {
|
if (StringUtil.hasText(this.fileUploadPath)) {
|
return this.fileUploadPath;
|
}
|
ClassPathResource fileResource = new ClassPathResource("/");
|
try {
|
return new File(fileResource.getFile(), "/public").getAbsolutePath();
|
} catch (IOException e) {
|
throw new RuntimeException(e);
|
}
|
}
|
|
public DocumentSplitter getDocumentSplitter (String splitterName, int chunkSize, int overlapSize, String regex, int excelRows){
|
|
if (StringUtil.noText(splitterName)) {
|
return null;
|
}
|
switch (splitterName) {
|
case "SimpleDocumentSplitter":
|
return new SimpleDocumentSplitter(chunkSize, overlapSize);
|
case "RegexDocumentSplitter":
|
return new RegexDocumentSplitter(regex);
|
case "SimpleTokenizeSplitter":
|
if (overlapSize == 0){
|
return new SimpleTokenizeSplitter(chunkSize);
|
} else {
|
return new SimpleTokenizeSplitter(chunkSize, overlapSize);
|
}
|
case "ExcelDocumentSplitter":
|
return new ExcelDocumentSplitter(excelRows);
|
default:
|
return null;
|
}
|
|
}
|
|
}
|