我创建了一个将PDF转换为Excel的程序。转换需要很长时间(100页= 10分钟)。它可以正常运行约15-20分钟,此后读取PDPage时将发生错误。
Java GC是否可以在程序结束之前“清除”变量?
代码:
private class Search_Text implements Runnable {
// Bounds of the page region this task searches; run() uses them to build the
// Rectangle regions that are handed to the stripper.
private int x, y, width, height;
// Page whose content is extracted for every region probe.
private PDPage pdPage;
// Monitor held around every extractRegions(pdPage) call; the same object is
// passed on to the child Search_Text tasks created in run().
private Object lock;
// Findings of this task. run() stores entries of the form {text, lowerBound, upperBound}.
private ArrayList<Object[]> result;
// Stripper owned by this task; its "cell" region is re-registered before each extraction.
private PDFTextStripperByArea strip;
/**
 * Creates a search task for one rectangular region of a page.
 *
 * @param x      x coordinate of the region
 * @param y      y coordinate of the region
 * @param width  width of the region
 * @param height height of the region
 * @param pdPage page the text is extracted from
 * @param lock   monitor held around every extraction call on {@code pdPage}
 * @throws IOException propagated from the {@code PDFTextStripperByArea} constructor
 */
public Search_Text(int x, int y, int width, int height, PDPage pdPage, Object lock) throws IOException {
    // Objects owned by this task.
    this.strip = new PDFTextStripperByArea();
    this.result = new ArrayList<>();

    // Shared collaborators supplied by the caller.
    this.pdPage = pdPage;
    this.lock = lock;

    // Region to search.
    this.x = x;
    this.y = y;
    this.width = width;
    this.height = height;
}
/**
 * Searches this task's region for text and stores the findings in {@code result}.
 *
 * <p>A region whose height is below 10 is scanned directly (see {@link #scanThinRegion()}).
 * A taller region is split into two halves; each half that contains text is searched by a
 * child {@code Search_Text} on its own thread and the children's results are concatenated,
 * first half first (see {@link #searchHalves()}).
 *
 * <p>Failures are logged, not propagated. If this thread is interrupted while waiting for
 * its children, the interrupt status is restored so that callers can still observe it.
 */
@Override
public void run() {
    if (height < 10) {
        result = scanThinRegion();
        return;
    }
    try {
        searchHalves();
    } catch (InterruptedException ex) {
        // Restore the flag: catching InterruptedException clears it, which would hide the
        // cancellation request from whoever started this task.
        Thread.currentThread().interrupt();
        Logger.getLogger(Pdf2Excell.class.getName()).log(Level.SEVERE, null, ex);
    } catch (IOException ex) {
        Logger.getLogger(Pdf2Excell.class.getName()).log(Level.SEVERE, null, ex);
    }
}

/**
 * Scans the region with a probe rectangle that starts one unit tall and grows until it
 * contains text, recording an entry {@code {text, lowerBound, upperBound}} for each hit.
 *
 * @return the entries found, in scan order
 */
private ArrayList<Object[]> scanThinRegion() {
    ArrayList<Object[]> found = new ArrayList<>();
    int upper = y;
    int bottom = 1;
    while (upper + bottom <= y + height) {
        String str = extractCellTextLoggingErrors(new Rectangle(x, upper, width, bottom));
        if (!emptyString(str)) {
            found.add(new Object[]{str, upper + bottom, upper});
            upper += bottom;
            bottom = 1;
            // Advance one unit at a time until a one-unit probe comes back empty.
            while (upper + bottom < height + y && !emptyString(str)) {
                str = extractCellTextLoggingErrors(new Rectangle(x, upper, width, bottom));
                upper++;
            }
        } else {
            bottom += 1;
        }
        // NOTE(review): this runs on every iteration in which no text has been found yet,
        // so several empty placeholder entries can be added for one region. Kept as in
        // the original; confirm whether a single placeholder after the loop was intended.
        if (upper == y) {
            found.add(new Object[]{"", y + height, upper});
        }
    }
    return found;
}

/**
 * Splits the region into two halves, searches each non-empty half with a child task on a
 * separate thread, waits for both, and stores the combined result in {@code result}.
 *
 * @throws IOException          if a child task cannot be created or probing a half fails
 * @throws InterruptedException if this thread is interrupted while waiting for a child
 */
private void searchHalves() throws IOException, InterruptedException {
    int halfHeight = height / 2;
    Rectangle firstRect = new Rectangle(x, y, width, halfHeight);
    Rectangle lastRect = new Rectangle(x, y + halfHeight, width, height - halfHeight);
    Search_Text firstSearch = new Search_Text(x, y, width, halfHeight, pdPage, lock);
    Search_Text lastSearch = new Search_Text(x, y + halfHeight, width, height - halfHeight, pdPage, lock);
    Thread first = new Thread(firstSearch);
    Thread last = new Thread(lastSearch);

    // Only halves that actually contain text are worth searching further.
    if (!emptyString(extractCellText(firstRect))) {
        first.start();
    }
    if (!emptyString(extractCellText(lastRect))) {
        last.start();
    }

    // join() on a thread that was never started returns immediately; that child's result
    // is then simply the empty list it was constructed with.
    first.join();
    last.join();

    result = firstSearch.getResult();
    result.addAll(lastSearch.getResult());
}

/**
 * Registers {@code region} as the stripper's "cell" region, extracts the page while
 * holding {@code lock}, and returns the text found in the region.
 *
 * @param region area of the page to read
 * @return the text reported for the "cell" region
 * @throws IOException if the extraction fails
 */
private String extractCellText(Rectangle region) throws IOException {
    strip.addRegion("cell", region);
    synchronized (lock) {
        strip.extractRegions(pdPage);
    }
    return strip.getTextForRegion("cell");
}

/**
 * Same as {@link #extractCellText(Rectangle)}, but an extraction failure is logged and
 * whatever text the stripper currently holds for the "cell" region is returned instead.
 *
 * @param region area of the page to read
 * @return the text reported for the "cell" region
 */
private String extractCellTextLoggingErrors(Rectangle region) {
    try {
        return extractCellText(region);
    } catch (IOException ex) {
        Logger.getLogger(Pdf2Excell.class.getName()).log(Level.SEVERE, null, ex);
        return strip.getTextForRegion("cell");
    }
}
这是错误消息:
Exception in thread "Thread-214418" java.lang.RuntimeException: java.io.IOException: RandomAccessBuffer already closed
at org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:198)
at org.apache.pdfbox.pdfparser.PDFStreamParser$1.hasNext(PDFStreamParser.java:205)
at org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:255)
at org.apache.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:235)
at org.apache.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:215)
at org.apache.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:458)
at org.apache.pdfbox.util.PDFTextStripperByArea.extractRegions(PDFTextStripperByArea.java:153)
at prova.Pdf2Excell$Search_Text.run(Pdf2Excell.java:954)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.IOException: RandomAccessBuffer already closed
at org.apache.pdfbox.io.RandomAccessBuffer.checkClosed(RandomAccessBuffer.java:325)
at org.apache.pdfbox.io.RandomAccessBuffer.seek(RandomAccessBuffer.java:105)
at org.apache.pdfbox.io.RandomAccessFileInputStream.read(RandomAccessFileInputStream.java:96)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:284)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
at java.io.FilterInputStream.read(FilterInputStream.java:83)
at java.io.PushbackInputStream.read(PushbackInputStream.java:139)
at org.apache.pdfbox.io.PushBackInputStream.read(PushBackInputStream.java:90)
at org.apache.pdfbox.io.PushBackInputStream.peek(PushBackInputStream.java:68)
at org.apache.pdfbox.pdfparser.PDFStreamParser.hasNextSpaceOrReturn(PDFStreamParser.java:560)
at org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:408)
at org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:374)
at org.apache.pdfbox.pdfparser.PDFStreamParser.access$000(PDFStreamParser.java:49)
at org.apache.pdfbox.pdfparser.PDFStreamParser$1.tryNext(PDFStreamParser.java:193)
... 8 more
最佳答案
PDFBox 的设计前提是每个文档只由单个线程使用,而提问者(OP)却用多个线程访问同一个文档。尽管这种做法仍有可能行得通(因为这里是只读场景),但必须进行正确的同步。
这种同步很可能会使一切变得更加缓慢。因此,解决方案是完全使用不同的架构,即
使用 PDFTextStripper,重写其 writeString(String text, List<TextPosition> textPositions) 方法,并从参数 List<TextPosition> textPositions 中收集所需的信息。TextPosition 包含一小段文本(我认为通常是单个字母)的相关信息,其中包括它的位置。
结果表明,新方案的速度快了 4 倍。