18586361686
2025-05-23 1bf0fdbff15c79943aef117c5469840c46ad3266
feat: 知识库导入支持 ppt , pptx
close #IC6IOY
1个文件已添加
118 ■■■■■ 已修改文件
aiflowy-commons/aiflowy-common-ai/src/main/java/tech/aiflowy/common/ai/PowerPointDocumentParser.java 118 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
aiflowy-commons/aiflowy-common-ai/src/main/java/tech/aiflowy/common/ai/PowerPointDocumentParser.java
New file
@@ -0,0 +1,118 @@
package tech.aiflowy.common.ai;
import com.agentsflex.core.document.Document;
import com.agentsflex.core.document.DocumentParser;
import org.apache.poi.hslf.usermodel.HSLFSlideShow;
import org.apache.poi.hslf.usermodel.HSLFTextShape;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
import org.apache.poi.xslf.usermodel.XSLFShape;
import org.apache.poi.xslf.usermodel.XSLFTextShape;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class PowerPointDocumentParser implements DocumentParser {
    @Override
    public Document parse(InputStream inputStream) {
        try {
            // 尝试先按PPTX格式解析
            try {
                return parsePPTX(inputStream);
            } catch (Exception e) {
                // 如果不是PPTX格式,尝试按PPT格式解析
                return parsePPT(inputStream);
            }
        } catch (IOException e) {
            throw new RuntimeException("Failed to parse PowerPoint file", e);
        }
    }
    private Document parsePPTX(InputStream inputStream) throws IOException {
        StringBuilder content = new StringBuilder();
        Map<String, Object> metadata = new HashMap<>();
        List<Map<String, String>> slides = new ArrayList<>();
        try (XMLSlideShow slideShow = new XMLSlideShow(inputStream)) {
            metadata.put("slide_count", slideShow.getSlides().size());
            for (org.apache.poi.xslf.usermodel.XSLFSlide slide : slideShow.getSlides()) {
                Map<String, String> slideContent = new HashMap<>();
                StringBuilder slideText = new StringBuilder();
                // 提取幻灯片标题
                String title = slide.getTitle();
                if (title != null && !title.isEmpty()) {
                    slideContent.put("title", title);
                    slideText.append("Slide Title: ").append(title).append("\n");
                }
                // 提取幻灯片文本内容
                for (XSLFShape shape : slide.getShapes()) {
                    if (shape instanceof XSLFTextShape) {
                        String text = ((XSLFTextShape) shape).getText();
                        if (text != null && !text.isEmpty()) {
                            slideText.append(text).append("\n");
                        }
                    }
                }
                slideContent.put("content", slideText.toString());
                slideContent.put("slide_number", String.valueOf(slide.getSlideNumber()));
                slides.add(slideContent);
                content.append(slideText).append("\n\n");
            }
        }
        Document document = new Document(content.toString());
        document.addMetadata("slides", slides);
        document.addMetadata("format", "pptx");
        return document;
    }
    private Document parsePPT(InputStream inputStream) throws IOException {
        StringBuilder content = new StringBuilder();
        Map<String, Object> metadata = new HashMap<>();
        List<Map<String, String>> slides = new ArrayList<>();
        try (HSLFSlideShow slideShow = new HSLFSlideShow(inputStream)) {
            metadata.put("slide_count", slideShow.getSlides().size());
            for (org.apache.poi.hslf.usermodel.HSLFSlide slide : slideShow.getSlides()) {
                Map<String, String> slideContent = new HashMap<>();
                StringBuilder slideText = new StringBuilder();
                // ✅ 正确获取标题文本
                String titleText = slide.getTitle();
                if (titleText != null && !titleText.isEmpty()) {
                    slideContent.put("title", titleText);
                    slideText.append("Slide Title: ").append(titleText).append("\n");
                }
                // 提取幻灯片其他文本内容
                for (org.apache.poi.hslf.usermodel.HSLFShape shape : slide.getShapes()) {
                    if (shape instanceof HSLFTextShape) {
                        String text = ((HSLFTextShape) shape).getText();
                        if (text != null && !text.isEmpty()) {
                            slideText.append(text).append("\n");
                        }
                    }
                }
                slideContent.put("content", slideText.toString());
                slideContent.put("slide_number", String.valueOf(slide.getSlideNumber() + 1));
                slides.add(slideContent);
                content.append(slideText).append("\n\n");
            }
        }
        Document document = new Document(content.toString());
        document.addMetadata("slides", slides);
        document.addMetadata("format", "ppt");
        return document;
    }
}