使用以下代码将PDF文件写入HTML文件格式时...
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.fit.pdfdom.PDFDomTree;
import org.fit.pdfdom.PDFDomTreeConfig;
import org.fit.pdfdom.resource.HtmlResourceHandler;
import org.fit.pdfdom.resource.SaveResourceToDirHandler;
/**
 * Converts PDF files to HTML using PDFBox for loading and Pdf2Dom for rendering.
 */
public class PdfToHtmlConverter {

    /**
     * Renders a PDF file as an HTML document located at
     * {@code outputFilePath + File.separator + outputFileName + ".html"}.
     *
     * <p>Failures while opening the output file or while rendering are printed to
     * standard error and are not rethrown; the target path is returned regardless,
     * so a returned path does not guarantee that a complete HTML file exists.
     *
     * @param file the PDF file to convert
     * @param outputFilePath directory the HTML file is written into
     * @param outputFileName name of the HTML file without the {@code .html} extension
     * @return the path of the HTML file
     * @throws InvalidPasswordException declared for PDF loading (presumably raised for
     *         password-protected documents - confirm against the PDFBox version in use)
     * @throws IOException if the PDF cannot be loaded or closed, or the parser cannot be created
     * @throws ParserConfigurationException declared for the creation of the DOM parser
     */
    public String pdfToHtmlFileWriter(File file, String outputFilePath, String outputFileName)
            throws InvalidPasswordException, IOException, ParserConfigurationException {
        String outputFile = outputFilePath + File.separator + outputFileName + ".html";

        // try-with-resources: the document is released on every exit path, including
        // when parser creation fails or an Error escapes the catch block below.
        try (PDDocument pdf = PDDocument.load(file)) {
            // NOTE(review): this configuration is built but never handed to the parser
            // (the no-arg PDFDomTree constructor is used below), so the handlers set here
            // have no visible effect in this method. Left unchanged on purpose: wiring it
            // in would change where fonts and images end up. Confirm the intent.
            PDFDomTreeConfig config = PDFDomTreeConfig.createDefaultConfig();
            HtmlResourceHandler fontHandler = new SaveResourceToDirHandler();
            config.setFontHandler(fontHandler);
            HtmlResourceHandler imageHandler = new SaveResourceToDirHandler();
            config.setImageHandler(imageHandler);

            // create the DOM parser
            PDFDomTree parser = new PDFDomTree();

            // Explicit UTF-8 so the bytes written do not depend on the platform default
            // charset (which is what FileWriter would use).
            try (Writer woutput = new PrintWriter(outputFile, "UTF-8")) {
                // parse the document and serialize the resulting DOM to the writer
                parser.writeText(pdf, woutput);
            } catch (Exception e) {
                // Best-effort conversion: report the failure but still return the path.
                e.printStackTrace();
            }
        }
        return outputFile;
    }
}
并且build.gradle文件具有以下依赖项列表...
dependencies {
    // Every jar placed in the local lib/ directory
    compile fileTree(dir: 'lib', include: ['*.jar'])

    // PDF handling
    compile 'org.apache.pdfbox:pdfbox:2.0.6'
    compile 'org.apache.pdfbox:pdfbox-tools:2.0.6'

    // Logging
    // NOTE(review): 'log4j' is pinned to 2.11.0 while log4j-api/log4j-core are 2.6.1 - confirm this mix is intended
    compile 'org.apache.logging.log4j:log4j:2.11.0'
    compile 'org.apache.logging.log4j:log4j-api:2.6.1'
    compile 'org.apache.logging.log4j:log4j-core:2.6.1'

    // Mail and cryptography
    compile 'javax.mail:mail:1.4.1'
    compile 'org.bouncycastle:bcmail-jdk15:1.46'
    compile 'org.bouncycastle:bcprov-jdk15on:1.47'

    // Caching and general utilities
    compile 'net.sf.ehcache:ehcache-core:2.4.6'
    compile 'com.google.guava:guava:11.0.2'
    compile 'redis.clients:jedis:2.9.0'

    // Office documents
    compile 'org.apache.poi:poi-ooxml:3.17'
    compile 'org.apache.poi:poi:3.17'

    // PDF to HTML/DOM conversion
    compile 'net.sf.cssbox:pdf2dom:1.7'

    // JBIG2 image decoding
    // NOTE(review): the stack trace in this post is raised from com.levigo.jbig2 classes, presumably shipped by this artifact
    compile 'com.levigo.jbig2:levigo-jbig2-imageio:1.6.5'

    // JSON
    compile 'com.google.code.gson:gson:2.8.2'
    compile 'org.json:json:20180130'
}
糟糕!从JDK获得以下消息...
[org.glassfish.jersey.server.ContainerException: java.util.ServiceConfigurationError: com.levigo.jbig2.util.log.LoggerBridge: Provider com.levigo.jbig2.util.log.JDKLoggerBridge not a subtype] with root cause
java.util.ServiceConfigurationError: com.levigo.jbig2.util.log.LoggerBridge: Provider com.levigo.jbig2.util.log.JDKLoggerBridge not a subtype
at java.util.ServiceLoader.fail(Unknown Source)
at java.util.ServiceLoader.access$300(Unknown Source)
at java.util.ServiceLoader$LazyIterator.nextService(Unknown Source)
at java.util.ServiceLoader$LazyIterator.next(Unknown Source)
at java.util.ServiceLoader$1.next(Unknown Source)
at com.levigo.jbig2.util.log.LoggerFactory.getLogger(LoggerFactory.java:42)
at com.levigo.jbig2.util.log.LoggerFactory.getLogger(LoggerFactory.java:48)
at com.levigo.jbig2.JBIG2ImageReader.<clinit>(JBIG2ImageReader.java:45)
at com.levigo.jbig2.JBIG2ImageReaderSpi.createReaderInstance(JBIG2ImageReaderSpi.java:116)
at javax.imageio.spi.ImageReaderSpi.createReaderInstance(Unknown Source)
at javax.imageio.ImageIO$ImageReaderIterator.next(Unknown Source)
at javax.imageio.ImageIO$ImageReaderIterator.next(Unknown Source)
at org.apache.pdfbox.filter.Filter.findImageReader(Filter.java:133)
at org.apache.pdfbox.filter.JBIG2Filter.decode(JBIG2Filter.java:54)
at org.apache.pdfbox.cos.COSInputStream.create(COSInputStream.java:69)
at org.apache.pdfbox.cos.COSStream.createInputStream(COSStream.java:167)
at org.apache.pdfbox.pdmodel.common.PDStream.createInputStream(PDStream.java:235)
at org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject.<init>(PDImageXObject.java:125)
at org.apache.pdfbox.pdmodel.graphics.PDXObject.createXObject(PDXObject.java:70)
at org.apache.pdfbox.pdmodel.PDResources.getXObject(PDResources.java:409)
at org.fit.pdfdom.PDFBoxTree.processFontResources(PDFBoxTree.java:397)
at org.fit.pdfdom.PDFBoxTree.updateFontTable(PDFBoxTree.java:361)
at org.fit.pdfdom.PDFDomTree.updateFontTable(PDFDomTree.java:544)
at org.fit.pdfdom.PDFBoxTree.processPage(PDFBoxTree.java:206)
at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:319)
at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266)
at org.fit.pdfdom.PDFDomTree.createDOM(PDFDomTree.java:218)
at org.fit.pdfdom.PDFDomTree.writeText(PDFDomTree.java:194)
at com.pype.html.converter.PdfToHtmlConverter.pdfToHtmlFileWriter(PdfToHtmlConverter.java:91)
at com.pype.drawings.slicing.VerticalSlicer.convertCompleteSinglePagePdftoHtml(VerticalSlicer.java:540)
at com.pype.drawings.slicing.VerticalSlicer.convertCompletePdfPageToHtml(VerticalSlicer.java:104)
at com.pype.pdf.schedules.extractor.ExtractSchedules.generateHtmlFiles(ExtractSchedules.java:344)
at com.pype.pdf.schedules.extractor.ExtractSchedules.getIdentifiedSchedulesUsingElements(ExtractSchedules.java:218)
at com.pype.solr.rest.api.ExtractPDFDrawing.processUploadedPDFFile(ExtractPDFDrawing.java:511)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at org.glassfish.jersey.server.model.internal.ResourceMethodInvocationHandlerFactory$1.invoke(ResourceMethodInvocationHandlerFactory.java:81)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher$1.run(AbstractJavaResourceMethodDispatcher.java:144)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.invoke(AbstractJavaResourceMethodDispatcher.java:161)
at org.glassfish.jersey.server.model.internal.JavaResourceMethodDispatcherProvider$TypeOutInvoker.doDispatch(JavaResourceMethodDispatcherProvider.java:205)
at org.glassfish.jersey.server.model.internal.AbstractJavaResourceMethodDispatcher.dispatch(AbstractJavaResourceMethodDispatcher.java:99)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.invoke(ResourceMethodInvoker.java:389)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:347)
at org.glassfish.jersey.server.model.ResourceMethodInvoker.apply(ResourceMethodInvoker.java:102)
at org.glassfish.jersey.server.ServerRuntime$2.run(ServerRuntime.java:326)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:271)
at org.glassfish.jersey.internal.Errors$1.call(Errors.java:267)
at org.glassfish.jersey.internal.Errors.process(Errors.java:315)
at org.glassfish.jersey.internal.Errors.process(Errors.java:297)
at org.glassfish.jersey.internal.Errors.process(Errors.java:267)
at org.glassfish.jersey.process.internal.RequestScope.runInScope(RequestScope.java:317)
at org.glassfish.jersey.server.ServerRuntime.process(ServerRuntime.java:305)
at org.glassfish.jersey.server.ApplicationHandler.handle(ApplicationHandler.java:1154)
at org.glassfish.jersey.servlet.WebComponent.serviceImpl(WebComponent.java:473)
at org.glassfish.jersey.servlet.WebComponent.service(WebComponent.java:427)
at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:388)
at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:341)
at org.glassfish.jersey.servlet.ServletContainer.service(ServletContainer.java:228)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:231)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.apache.tomcat.websocket.server.WsFilter.doFilter(WsFilter.java:53)
at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:193)
at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:166)
at org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:199)
at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:96)
at org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:502)
at org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:140)
at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:81)
at org.apache.catalina.valves.AbstractAccessLogValve.invoke(AbstractAccessLogValve.java:651)
at org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:87)
at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:342)
at org.apache.coyote.http11.Http11Processor.service(Http11Processor.java:501)
at org.apache.coyote.AbstractProcessorLight.process(AbstractProcessorLight.java:66)
at org.apache.coyote.AbstractProtocol$ConnectionHandler.process(AbstractProtocol.java:754)
at org.apache.tomcat.util.net.NioEndpoint$SocketProcessor.doRun(NioEndpoint.java:1376)
at org.apache.tomcat.util.net.SocketProcessorBase.run(SocketProcessorBase.java:49)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at org.apache.tomcat.util.threads.TaskThread$WrappingRunnable.run(TaskThread.java:61)
at java.lang.Thread.run(Unknown Source)
搜索更多有关此错误的信息后,没有任何线索。如果有人有任何想法,请对此提出一些建议。
谢谢
最佳答案
请更新至最新版本的jbig2解码器,即3.0.2。感谢levigo solutions GmbH的贡献,jbig2解码器现在已成为Apache PDFBox的一部分。对于Maven,请使用以下依赖配置:
<!-- JBIG2 ImageIO plugin published under the Apache PDFBox group -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>jbig2-imageio</artifactId>
    <version>3.0.2</version>
</dependency>
或使用direct download。