问题描述
我创建了一个程序来读取和提取PDF文件中的文本...但是它会在执行过程中产生此异常.
I created a program to read and extract text from PDF files... But it producing this exception during execution..
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
这是我的代码:
import java.awt.Rectangle;
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.util.PDFTextStripperByArea;
public class PatentAdder {
/**
* @param args
*/
public static String patno,patit,patdate,patfilled,appno;
private static int File;
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
int cnt=0;
if( args.length == 1 )
{
// usage();
}
else
{
PDDocument document = null;
try
{
File dataDir = new File("F:/patents/test/tittest/USP2002w17/06/378/pdfs");
File[] files = dataDir.listFiles();
// String[] files = dataDir.list();
int count=0;
// System.out.println ("Satrt1");
for (File file : files) {
// System.out.println ("Satrt2");
File f = file;
if (!f.isDirectory()) {
document = PDDocument.load(f.getAbsolutePath());
if( document.isEncrypted() )
{
try
{
document.decrypt( "" );
}
catch( InvalidPasswordException e )
{
System.err.println( "Error: Document is encrypted with a password." );
System.exit( 1 );
}
} }
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition( true );
// Rectangle rectt = new Rectangle( 590, 108, 600, 100 ); // enlarge title
Rectangle rectt = new Rectangle( 288, 60, 222, 40 );
Rectangle rect = new Rectangle( 55, 108, 230, 600 ); // US-Patent title h40
// Rectangle rect = new Rectangle( 108, 210, 480, 499 ); //full enlarge
stripper.addRegion( "class1", rect );
stripper.addRegion("class2", rectt);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage firstPage = (PDPage)allPages.get( 0 );
stripper.extractRegions( firstPage );
String title = "(?s)\\(54\\)\\s*([\\w\\s,-]+)|(?s)\\[54\\]\\s*([\\w\\s,-]+)";
String in ="((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\[\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Assignee:))|((?s)\\(\\d\\d\\)\\s+Inventor:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Inventor\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Assignee:))";
String as ="((?s)\\(\\d\\d\\)\\s+Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=\\(\\d*\\)\\s+Notice:))|((?s)\\(\\d\\d\\)\\s+Assignee:\\s*([\\-\\w\\d\\s,\\.\\(\\)-]+)*[\\w\\']*(?=\\n))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+);([\\w\\s.\\',();-]+)(?=Notice:))|(Assignee\\w*:\\s*\\w*([\\w\\d,.\\s)(-]+)(?=Notice:))";
String app_no ="(?s)\\(21\\)\\s*([\\w\\s,.://-]+)|(?s)\\[21\\]\\s*([\\w\\s,.://-]+)";
String filed ="((?s)\\(22\\)\\s*([\\w\\s,.://-]+))|((?s)\\(22\\)\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+))|((?s)\\[22\\]\\s*([\\w\\s,.://-]+)(?=\\s*\\n\\s*Related))";
String term ="((?s)\\s*Term\\s*([\\w\\s,.://-]+))|((?s)\\s*Term\\s*([\\w\\s,.://-]+))";
String pat_no = "(?s)\\s*Patent No\\.\\:\\s*([\\w\\d\\s,.://-]+)|(?s)\\s*Patent Number\\:\\s*([\\w\\d\\s,.://-]+)";
String pat_dt = "(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventor:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\(45\\)\\s*Date([\\*\\w\\d\\s,.://-]+)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\[\\d*\\]\\s+Inventor:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)(?=\\(\\d*\\)\\s+Inventors:)|(?s)\\[45\\]\\s*Date([\\*\\w\\d\\s,.://-]+)";
// System.out.println(rg);
String region = stripper.getTextForRegion( "class1" );
// System.out.println(region);
String regiont = stripper.getTextForRegion( "class2" );
Pattern p = Pattern.compile(in);
Matcher m = p.matcher(region);
Pattern p2 = Pattern.compile(as);
Matcher m2 = p2.matcher(region);
Pattern p3 = Pattern.compile(title);
Matcher m3 = p3.matcher(region);
Pattern p4 = Pattern.compile(pat_no);
Matcher m4 = p4.matcher(regiont);
Pattern p5 = Pattern.compile(app_no);
Matcher m5 = p5.matcher(region);
Pattern p6 = Pattern.compile(filed);
Matcher m6 = p6.matcher(region);
Pattern p7 = Pattern.compile(pat_dt);
Matcher m7 = p7.matcher(regiont);
while(m.find())
{
// System.out.println(m.group());
}
while(m2.find())
{
// System.out.println(m2.group());
}
while(m3.find())
{
// System.out.println(m3.group());
patit = m3.group().replace("(54)", " ");
patit = patit.trim();
}
while(m4.find())
{
// System.out.println(m4.group());
patno = m4.group().replace("Patent No.: ", " ");
patno = patno.replace("Patent No: ", " ");
patno = patno.replace("Patent", " ");
patno = patno.replace("No.:", " ");
patno = patno.replace("No:", " ");
patno = patno.replace("Number: ", " ");
patno = patno.replace("Number.: ", " ");
patno = patno.trim();
}
while(m5.find())
{
// System.out.println(m5.group());
appno = m5.group().replace("(21)", " ");
appno = appno.replace("Appl. No.: ", " ");
appno = appno.replace("Appl.", " ");
appno = appno.replace("No.", " ");
appno = appno.replace(":"," ");
appno = appno.trim();
}
while(m6.find())
{
// System.out.println(m6.group());
patfilled = m6.group().replace("(22)", " ");
patfilled = patfilled.replace("Filed", " ");
patfilled= patfilled.replace("PCT", " ");
patfilled = patfilled.replace(":", " ");
patfilled = patfilled.replace("\n", "");
patfilled= patfilled.trim();
}
while (m7.find())
{
patdate = m7.group().replace("(45) Date of Patent: ", " ");
patdate = patdate.replace("(45) Date of Patent.: ", " ");
patdate = patdate.replace("(45)", " ");
patdate = patdate.replace("Date", " ");
patdate = patdate.replace("of", " ");
patdate = patdate.replace("Patent.: ", " ");
patdate = patdate.replace("Patent: ", " ");
patdate = patdate.replace("Reissued", " ");
patdate = patdate.replace(":", " ");
patdate = patdate.replace("Patent", " ");
patdate = patdate.replace("*", " ");
patdate = patdate.trim();
}
System.out.println("File name:"+f.getName());
System.out.println(patno +"\n"+patit+"\n"+patdate+"\n"+patfilled+"\n"+appno+"\n-------");
// boolean st = addPatent (patno,patit,patdate,patfilled,appno);
// if ( st == true ) System.out.println(patno+" added");
// else System.out.println(patno+" not added");
count++;
}
System.out.print("-----Finised "+count+" Files------ \n");
}
finally
{
if( document != null )
{
document.close();
}
}
}
}
catch (Exception e)
{
System.out.println(e.getStackTrace());
//System.out.println(e.getLocalizedMessage());
System.out.println(e.getMessage());
System.out.println(e.getCause());
//System.out.println(e.getClass());
e.printStackTrace();
}
}
static boolean addPatent(String pno,String ptitle,String pat_date ,String filed_date , String appl_no )
{
int i=0;
boolean status =false;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
Connection con = DriverManager.getConnection("jdbc:mysql://localhost:3306/patent", "root","ragesh");
PreparedStatement st = con.prepareStatement("insert into patents_info values (?,?,?,?,?,?)");
st.setString(1, pno);
st.setString(2, ptitle);
st.setString(3,pat_date);
st.setString(4,filed_date);
st.setString(5,appl_no);
st.setInt(6,0);
i=st.executeUpdate();
if (i > 0) status= true;
}
catch (Exception e)
{
e.printStackTrace();
}
return status;
}
public static List<File> getAllChildFiles(File[] dir)
{
List<File> result = new ArrayList<File>();
for (File file : dir)
{
if (file.isDirectory())
{
File[] children = file.listFiles();
List<File> grandChildren = getAllChildFiles(children);
result.addAll(grandChildren);
}
else
{
result.add(file);
}
}
return result;
}
}
该程序最多可以进行一些迭代输出,但是像上面指定的那样会暂停和停止异常..
This programs gives output up to some iterations , but halts and thorw exception like above specified ..
带有异常的示例输出:
Sample output with Exception :
File name:06019327.pdf
Number: 6,019,327
[54] INSTALLATION STRUCTURE OF OUTDOOR
COMMUNICATION DRIVE
[45] Feb. 1, 2000
[22] Aug. 30, 1996
Related U.S. Application Data
[21] 08/704,920
-------
File name:06019328.pdf
Number: 6,019,328
[54] STAY-PUT PEGBOARD ACCESSORY
[45] Feb. 1, 2000
[22] Jan. 27, 1999
[21] 09/238,242
-------
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
[Ljava.lang.StackTraceElement;@43a6684f
Error: Expected a long type, actual='930[299'
java.io.IOException: Error: Expected a long type, actual='930[299'
at org.apache.pdfbox.pdfparser.BaseParser.readLong(BaseParser.java:1669)
at org.apache.pdfbox.pdfparser.PDFObjectStreamParser.parse(PDFObjectStreamParser.java:100)
at org.apache.pdfbox.cos.COSDocument.dereferenceObjectStreams(COSDocument.java:632)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:244)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1205)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1172)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1097)
at PatentAdder.main(PatentAdder.java:60)
第二个问题
有时执行会冻结..也就是说,经过更多次迭代后,它只是显示闪烁的光标..为什么...?
Sometimes the execution freezes.. That is it just showing the blinking cursor after some more iterations .... Why... ?
File name:06019329.pdf
Number: 6,019,329
[54] CLAMPS
[45] Feb. 1, 2000
[22] Oct. 30, 1997
[21] 08/961,310
-------
File name:06019330.pdf
Number: 6,019,330
[54] ROOF GUARD DEVICE FOR LIFTING
OBJECTS ON TO A ROOF
[45] Feb. 1, 2000
[22] Nov. 20, 1997
[21] 08/974,866
-------
File name:06019331.pdf
Number: 6,019,331
[54] CANTILEVER BRACKET ASSEMBLY
[45] Feb. 1, 2000
[22] May 28, 1997
Related U.S. Application Data
[21] 08/865,587
-------
(__ cursor blinks on... and execution freezes )
请帮助我解决这2个问题:
Please help me to resolve this 2 issues:
JDK版本:1.6PDF Box 1.8.3
JDK version : 1.6PDF Box 1.8.3
推荐答案
这是由于PDFBox不遵循PDF对字母的引用引起的:)
This is caused by PDFBox not following the PDF Reference to the letter :)
PDF令牌流中的令牌可以用空格定界(大多数编程语言通常如此),但是也可以隐式地定界:因为下一个字符是它自己的定界符,因为它引入了特殊的功能.因此,遇到诸如
Tokens in a PDF token stream may be delimited by white space (as usual for most programming language), but also implicitly: because the next character is a delimiter of its own, since it introduces a special function. Therefore, it's totally valid -- and certainly not unusual -- to encounter constructions such as
/A[123/B(C)]
这完全等同于稍长的时间
which is entirely equivalent to the slightly longer
/A [ 123 /B (C) ]
来自ISO"PDF 32000-1:2008", 7.2.2字符集:
From ISO "PDF 32000-1:2008", 7.2.2 Character Set:
显示的空格字符 [...]
定界符(,),<,>,[,],{,},/和%是特殊的[..]
The delimiter characters (, ), <, >, [, ], {, }, /, and % are special [..]
原始代码显示了当前的实现(取自 http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java ):
The original code shows the current implementation (taken from http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java):
/**
1681 * This method is used to read a token by the {@linkplain #readInt()} method and the {@linkplain #readLong()} method.
1682 *
1683 * @return the token to parse as integer or long by the calling method.
1684 * @throws IOException throws by the {@link #pdfSource} methods.
1685 */
1686 protected final StringBuilder readStringNumber() throws IOException
1687 {
1688 int lastByte = 0;
1689 StringBuilder buffer = new StringBuilder();
1690 while( (lastByte = pdfSource.read() ) != 32 &&
1691 lastByte != 10 &&
1692 lastByte != 13 &&
1693 lastByte != 60 && //see sourceforge bug 1714707
1694 lastByte != 0 && //See sourceforge bug 853328
1695 lastByte != -1 )
1696 {
1697 buffer.append( (char)lastByte );
1698 }
1699 if( lastByte != -1 )
1700 {
1701 pdfSource.unread( lastByte );
1702 }
1703 return buffer;
1704 }
下一个字符"是针对7.2.2中表1中的空白字符(从上至下,空格",换行",回车"和Nul字符)进行测试的.他们仍然缺少换页"代码0x0C
,以及常见的"Tab" 0x09
,但是确实可以测试文件结尾(-1)和(60
),后者可能是因为以前有人遇到过类似的错误.(我无法找到原始错误报告#1714707,但我可以推断出它一定与您的问题相似.)
The 'next character' is tested against the whitespace characters from Table 1 in 7.2.2 (top to bottom, "Space", "Line Feed", "Carriage Return", and the Nul character -- though they are still missing the "Form Feed" code 0x0C
and, very odd, the common "Tab" 0x09
. They do test, however, for an end-of-file (the -1) and <
(60
), the latter probably because someone ran into a similar bug before. (I could not locate the original bug report #1714707 but I can infer it must have been similar to your issue.)
必须通过添加以下字符来完成此列表,这些字符是从7.2.2中的表2复制的 verbatim :
This list must be completed by adding the following characters, copied verbatim from Table 2 in 7.2.2:
Table 2 – Delimiter characters
Glyph Decimal Hexadecimal Octal Name
( 40 28 50 LEFT PARENTHESIS
) 41 29 51 RIGHT PARENTHESIS [1]
< 60 3C 60 LESS-THAN SIGN
> 62 3E 62 GREATER-THAN SIGN
[ 91 5B 133 LEFT SQUARE BRACKET
] 93 5D 135 RIGHT SQUARE BRACKET
{ 123 7B 173 LEFT CURLY BRACKET
} 125 7D 175 RIGHT CURLY BRACKET
/ 47 2F 57 SOLIDUS
% 37 25 45 PERCENT SIGN
奇怪的是{
和}
,因为目前它们仅出现在PostScript代码片段中,它们不是基础对象,而是包含在stream
中.但是,从历史上看,也许它们是为将来的扩展而保留"的. (现在不再是问题,因为PDF格式已作为ISO规范冻结了.)
The odd ones out are {
and }
since, currently, they only appear inside PostScript snippets, and those are not base objects but contained inside a stream
. But perhaps they were historically "reserved for future expansion" (which should no longer be an issue, now the PDF format has been frozen as an ISO specification).
此外,字符%
本身是一个定界符,但它需要一些特殊的处理并引入注释:
Also, the character %
in itself is a delimiter, but it needs some special handling as well as it introduces a comment:
(请注意,这里存在一些歧义:
(Note there is a little ambiguity there:
该应该不是必需的,因为上一行已经说到注释在行尾之前 之前结束了;因此,行尾本身应该保留在输入流中,并因此充当分隔符.也许只不过是一条悬而未决的方法而已.)
which should not be necessary, because the previous line already says the comment ends before the end-of-line; and so the end-of-line itself ought to remain in the input stream and thus act as a separator. Perhaps nothing more than a case of a belt-and-suspenders approach.)
[1]复习:实际上,右括号是多余的.它只能在匹配的开头括号后出现,并引入一个字符串.一次查看一个令牌,您永远都不会遇到流浪)
-如果您这样做,则表示PDF流格式错误.
[1] On reviewing: actually, the closing parenthesis is redundant. It can only occur after a matching opening parentheses, and that introduces a string. Viewed one token at a time, you should never encounter a stray )
-- if you do, that indicates a malformed PDF stream.
这篇关于这是什么java.io.IOException:错误:预期为长类型,actual ='930 [299'告诉我们?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!