工作中经常需要处理 Word 文件,以前都是靠手工处理,实在太笨了;程序员应该有自己的方式——用代码来读取。
依赖
<!-- Apache POI artifacts used by the WordUtil example; all three are pinned to the same version. -->
<dependencies>
<!-- Core POI library. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.16</version>
</dependency>
<!-- OOXML support; presumably supplies the org.apache.poi.xwpf / openxml4j classes used to read .docx files. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.16</version>
</dependency>
<!-- Scratchpad module; presumably supplies the org.apache.poi.hwpf classes used to read legacy .doc files. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.16</version>
</dependency>
</dependencies>
代码
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import java.io.*;
/**
 * Small command-line helper that extracts the plain text of a Word document
 * with Apache POI and prints it to standard output.
 *
 * <p>Legacy binary documents ({@code .doc}) are read with {@link WordExtractor};
 * OOXML documents ({@code .docx}) are read with {@link XWPFWordExtractor}.
 */
public class WordUtil {

    /** Document read by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_FILE_PATH = "D:\\work\\(待提供)202001028 - 2版.docx";

    /**
     * Prints the text of a Word document.
     *
     * @param args optional; {@code args[0]} is the path of the document to read.
     *             When absent, {@link #DEFAULT_FILE_PATH} is used.
     */
    public static void main(String[] args) {
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_PATH;
        String content = readWord(filePath);
        // readWord reports its own failures and returns null; avoid dereferencing that null here.
        if (content != null) {
            System.out.println(content);
        }
    }

    /**
     * Reads the text of a Word document, choosing the extractor from the file extension.
     *
     * @param path path of the document; the extension is matched case-insensitively
     * @return the extracted text, or {@code null} when the path is {@code null}, the file
     *         is not a {@code .doc}/{@code .docx} file, or reading it failed (the failure
     *         is reported on the standard streams)
     */
    private static String readWord(String path) {
        String buffer = null;
        try {
            if (path != null && hasExtension(path, ".doc")) {
                buffer = readDoc(path);
            } else if (path != null && hasExtension(path, ".docx")) {
                buffer = readDocx(path);
            } else {
                System.out.println("此文件不是word文件!");
            }
        } catch (Exception e) {
            // Deliberately broad: POI signals malformed or mis-named files with unchecked
            // exceptions too, and this helper is best-effort (report and return null).
            e.printStackTrace();
        }
        return buffer;
    }

    /**
     * Tells whether {@code path} ends with {@code extension}, ignoring case.
     *
     * @param path      non-null file path
     * @param extension extension including the leading dot, e.g. {@code ".docx"}
     * @return {@code true} if the path ends with the extension
     */
    private static boolean hasExtension(String path, String extension) {
        int start = path.length() - extension.length();
        // regionMatches returns false for a negative offset, so short paths are handled.
        return path.regionMatches(true, start, extension, 0, extension.length());
    }

    /**
     * Extracts the text of an OOXML ({@code .docx}) document.
     *
     * @param path path of the {@code .docx} file
     * @return the document text
     * @throws IOException         if the file cannot be read
     * @throws OpenXML4JException  if the package is not a valid OOXML container
     * @throws XmlException        if the document XML cannot be parsed
     */
    private static String readDocx(String path) throws IOException, OpenXML4JException, XmlException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // The package is opened read-write; revert() releases it WITHOUT saving,
            // so merely reading the text never rewrites the source file.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the text of a legacy binary ({@code .doc}) document.
     *
     * @param path path of the {@code .doc} file
     * @return the document text
     * @throws IOException if the file cannot be read
     */
    private static String readDoc(String path) throws IOException {
        try (InputStream is = new FileInputStream(path);
             WordExtractor ex = new WordExtractor(is)) {
            return ex.getText();
        }
    }
}