POI按照Word文档指定标题进行拆分
Pom配置
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven project descriptor for the sample: it declares only the project coordinates and two Apache POI dependencies. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this sample project. -->
<groupId>org.taowd</groupId>
<artifactId>Hello_Word</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Apache POI OOXML module, pinned to 3.8. Presumably this is what supplies the org.apache.poi.xwpf.usermodel classes imported by the sample code; confirm against the POI distribution. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.8</version>
</dependency>
<!-- Apache POI scratchpad module, pinned to the same 3.8 version as poi-ooxml. NOTE(review): the sample code shown imports only org.apache.poi.xwpf.usermodel classes; confirm whether this dependency is actually required. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>
</dependencies>
</project>
代码实现
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
 * Extracts one chapter from a Word (.docx) document with Apache POI.
 *
 * <p>Every body element of the source document is visited to locate the heading that opens the
 * wanted chapter and the heading that opens the following chapter. All body elements outside that
 * range are then removed from a second, freshly loaded copy of the document, and the remainder is
 * written to a new file. Because elements are removed rather than copied, the remaining content
 * is intended to keep its original styling.
 *
 * <p>Approach adapted from https://blog.csdn.net/qq_37201321/article/details/91864843
 */
public class HelloWorld {

    /** Document that is read and split. */
    private static final String SOURCE_FILE = "test.docx";

    /** File that receives the extracted chapter. */
    private static final String TARGET_FILE = "new_test1111.docx";

    /** Text of the heading that opens the chapter to keep. */
    private static final String FIRST_HEADING = "系统设计";

    /** Text of the heading that opens the chapter after the one to keep. */
    private static final String NEXT_HEADING = "系统功能的设计与实现";

    /** Value of {@link #getTitleLvl} that the chapter headings are required to have. */
    private static final String TOP_OUTLINE_LEVEL = "0";

    /** One or more whitespace characters; compiled once and reused by {@link #unescapeJava}. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    public static void main(String[] args) throws IOException {
        readAndWriterTest4();
    }

    /**
     * Reads {@code test.docx}, keeps the body elements from the heading {@code 系统设计} up to (but
     * not including) the heading {@code 系统功能的设计与实现}, and writes them to
     * {@code new_test1111.docx}.
     *
     * <p>If the first heading is not found, nothing is removed from the front of the document; if
     * the second heading is not found, nothing is removed from the end.
     *
     * @throws IOException if the source document cannot be read or the target file cannot be
     *     written; failures are propagated to the caller instead of being printed and ignored
     */
    public static void readAndWriterTest4() throws IOException {
        int count = 0;
        // Inclusive index range [start, end] of the body elements to keep.
        int start = 0;
        int end = 0;

        // Pass 1: inspect the document and locate the chapter boundaries.
        FileInputStream fis = new FileInputStream(new File(SOURCE_FILE));
        try {
            XWPFDocument xdoc = new XWPFDocument(fis);
            List<IBodyElement> bodyElements = xdoc.getBodyElements();
            count = bodyElements.size();
            System.out.println(count);
            // Default: past the last index, so that nothing is cut off when the next heading is absent.
            end = count;
            for (int i = 0; i < count; i++) {
                IBodyElement bodyElement = bodyElements.get(i);
                if (bodyElement.getElementType() != BodyElementType.PARAGRAPH) {
                    continue;
                }
                XWPFParagraph paragraph = (XWPFParagraph) bodyElement;
                String control = getTitleLvl(xdoc, paragraph);
                String text = paragraph.getText();
                if (FIRST_HEADING.equals(text) && TOP_OUTLINE_LEVEL.equals(control)) {
                    start = i;
                }
                if (NEXT_HEADING.equals(text) && TOP_OUTLINE_LEVEL.equals(control)) {
                    System.out.println(text);
                    System.out.println(i);
                    // The kept range ends on the element just before the next chapter heading.
                    end = i - 1;
                }
            }
        } finally {
            fis.close();
        }

        System.out.println("#################################################################");
        System.out.println("#################################################################");

        // Pass 2: trim a freshly loaded copy, so that nothing done while inspecting the first
        // instance can end up in the written file.
        FileInputStream fis_new = new FileInputStream(new File(SOURCE_FILE));
        try {
            XWPFDocument out_doc = new XWPFDocument(fis_new);
            System.out.println("start:" + start);
            System.out.println("end:" + end);
            // Remove from the back first so the indices still to be processed stay valid.
            // Strictly greater than end: the element at index end belongs to the kept chapter.
            for (int i = count - 1; i > end; i--) {
                out_doc.removeBodyElement(i);
            }
            System.out.println("*****************");
            // Strictly below start: the chapter heading at index start is kept.
            for (int i = start - 1; i >= 0; i--) {
                out_doc.removeBodyElement(i);
            }
            FileOutputStream out = new FileOutputStream(new File(TARGET_FILE));
            try {
                out_doc.write(out);
            } finally {
                out.close();
            }
        } finally {
            fis_new.close();
        }
    }

    /**
     * Determines the heading level of a paragraph.
     *
     * <p>The outline level of a Word paragraph can be read with {@code getPPr().getOutlineLvl()},
     * but it may be defined in three places, which are consulted in this order:
     * <ol>
     *   <li>directly on the paragraph;</li>
     *   <li>on the paragraph's style;</li>
     *   <li>on the style that the paragraph's style is based on.</li>
     * </ol>
     * If none of them defines an outline level, the paragraph's style ID is returned instead, and
     * if there is no style ID either, the empty string is returned.
     *
     * <p>NOTE(review): because of the style-ID fallback, the result is not always an outline level;
     * a style whose ID happens to be a number is indistinguishable from a real level. Callers
     * comparing against {@code "0"} should be aware of this.
     *
     * @param doc document that owns the paragraph; used to resolve styles
     * @param para paragraph to inspect
     * @return the outline level as a decimal string, else the style ID, else {@code ""}
     */
    private static String getTitleLvl(XWPFDocument doc, XWPFParagraph para) {
        // 1. Outline level set directly on the paragraph.
        try {
            if (para.getCTP().getPPr().getOutlineLvl() != null) {
                return String.valueOf(para.getCTP().getPPr().getOutlineLvl().getVal());
            }
        } catch (Exception ignored) {
            // Best effort: a link in the chain is missing, so try the next source.
        }
        // 2. Outline level set on the paragraph's style.
        try {
            String level = styleOutlineLvl(doc, para.getStyle());
            if (level != null) {
                return level;
            }
        } catch (Exception ignored) {
            // Best effort: style or its properties are missing, so try the next source.
        }
        // 3. Outline level set on the base style of the paragraph's style.
        try {
            String baseStyleId =
                doc.getStyles().getStyle(para.getStyle()).getCTStyle().getBasedOn().getVal();
            String level = styleOutlineLvl(doc, baseStyleId);
            if (level != null) {
                return level;
            }
        } catch (Exception ignored) {
            // Best effort: no base style or no properties on it, so try the fallback.
        }
        // 4. Fallback: the style ID itself.
        try {
            if (para.getStyleID() != null) {
                return para.getStyleID();
            }
        } catch (Exception ignored) {
            // Best effort: report "no level" below.
        }
        return "";
    }

    /**
     * Reads the outline level stored on the style with the given ID.
     *
     * <p>No null checks are made along the lookup chain; a missing styles part, style or
     * paragraph-properties node surfaces as a runtime exception, which {@link #getTitleLvl}
     * catches and treats as "not defined here".
     *
     * @param doc document whose styles are searched
     * @param styleId ID of the style to inspect
     * @return the outline level as a decimal string, or {@code null} if the style defines none
     */
    private static String styleOutlineLvl(XWPFDocument doc, String styleId) {
        if (doc.getStyles().getStyle(styleId).getCTStyle().getPPr().getOutlineLvl() == null) {
            return null;
        }
        return String.valueOf(
            doc.getStyles().getStyle(styleId).getCTStyle().getPPr().getOutlineLvl().getVal());
    }

    /**
     * Removes every whitespace character (spaces, tabs, carriage returns, line feeds, ...) from
     * the given text. Despite its name, this method does not process escape sequences.
     *
     * @param str text to clean; may be {@code null}
     * @return {@code str} without whitespace, or {@code ""} when {@code str} is {@code null}
     */
    public static String unescapeJava(String str) {
        if (str == null) {
            return "";
        }
        return WHITESPACE.matcher(str).replaceAll("");
    }

    /** Describes one document to be produced by a split. Not used by the code in this class. */
    class NewWord {
        /** Start index of the new document (field name is spelled "strat" in the original). */
        int strat;
        /** End index of the new document. */
        int end;
        /** Name of the new document; new documents are exported per outline level. */
        String name;
    }
}
原文作者:Taoweidong
原文地址: https://blog.csdn.net/taoweidong1/article/details/119988747
本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
原文地址: https://blog.csdn.net/taoweidong1/article/details/119988747
本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
相关文章