已关闭。此问题基于主观意见,目前不接受回答。
想要改善这个问题吗?请编辑此帖子来更新问题,使其可以用事实和引用来回答。
2年前关闭。
Improve this question
我想知道一种提取各个小节的方法。我不确定是否已经存在用于此目的的算法,所以我考虑过从左到右扫描乐谱,提取一行小节上方和下方的所有空白。
我并不是在寻找将乐谱转换成MusicXML或提取其他有用信息的方法。基本上,我处理的就是一份普通文档,需要把其中的段落分离出来。我对段落所传达的信息不感兴趣,只想把它们从文档的其他区域中分块切出。在这里,一个段落就是一行小节。我不需要单独的小节,而是需要乐谱每一行上的所有小节。
这是我希望从整张乐谱中得到的输出之一,但不包含标题、作曲者等信息。
这是Apache PDFBox中可用的示例代码。
现在要裁剪图像,您可以使用:
可在此页面和该项目中找到
还有其他方式可以解析PDF文件中的图像,请参阅这段代码,尤其是这一部分
想要改善这个问题吗?请编辑此帖子来更新问题,使其可以用事实和引用来回答。
2年前关闭。
Improve this question
我想知道一种提取各个小节的方法。我不确定是否已经存在用于此目的的算法,所以我考虑过从左到右扫描乐谱,提取一行小节上方和下方的所有空白。
我并不是在寻找将乐谱转换成MusicXML或提取其他有用信息的方法。基本上,我处理的就是一份普通文档,需要把其中的段落分离出来。我对段落所传达的信息不感兴趣,只想把它们从文档的其他区域中分块切出。在这里,一个段落就是一行小节。我不需要单独的小节,而是需要乐谱每一行上的所有小节。
这是我希望从整张乐谱中得到的输出之一,但不包含标题、作曲者等信息。
最佳答案
假设您的乐谱保存在PDF文件中,我会使用Apache PDFBox从包含乐谱的输入PDF文件中提取图像,然后找出所需整行小节的坐标,据此定义裁剪区域并对选定的图像进行裁剪,反复调整直到获得所需的结果。
// Load the PDF and write out every image listed in each page's resources.
PDDocument document = PDDocument.load(inFile);
try {
    for (Object pageObject : document.getDocumentCatalog().getAllPages()) {
        PDPage page = (PDPage) pageObject;
        PDResources resources = page.getResources();
        if (resources == null) {
            // Nothing to extract from a page without a resource dictionary.
            continue;
        }
        Map pageImages = resources.getImages();
        if (pageImages != null) {
            for (Object keyObject : pageImages.keySet()) {
                String key = (String) keyObject;
                PDXObjectImage image = (PDXObjectImage) pageImages.get(key);
                image.write2OutputStream(/* some output stream */);
            }
        }
    }
} finally {
    // Always close the document, even when extraction fails part-way.
    document.close();
}
这是Apache PDFBox中可用的示例代码。
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
/**
 * Command-line tool that reads a PDF document and writes the images found in
 * its page resources (including images nested in form XObjects) to files.
 *
 * <p>Usage: {@code java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>}
 * <ul>
 *   <li>{@code -password <password>} password used to decrypt the document</li>
 *   <li>{@code -prefix <image-prefix>} prefix of the generated file names</li>
 *   <li>{@code -addkey} add the internal image key to the file name</li>
 *   <li>{@code -nonSeq} load the document with the non-sequential parser</li>
 * </ul>
 *
 * @author Ben Litchfield
 * @version $Revision: 1.7 $
 */
public class ExtractImages
{
    private static final String PASSWORD = "-password";
    private static final String PREFIX = "-prefix";
    private static final String ADDKEY = "-addkey";
    private static final String NONSEQ = "-nonSeq";

    /**
     * Largest meaningful argument count:
     * -password &lt;pw&gt; (2) + -prefix &lt;p&gt; (2) + -addkey (1) + -nonSeq (1) + PDF file (1).
     */
    private static final int MAX_ARGUMENTS = 7;

    /** Running number appended to file names; shared by all images of one run. */
    private int imageCounter = 1;

    private ExtractImages()
    {
        // instances are only created by main()
    }

    /**
     * This is the entry point for the application.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If there is an error decrypting or reading the document.
     */
    public static void main( String[] args ) throws Exception
    {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages( args );
    }

    /**
     * Parses the command line and, when a PDF file was named, extracts its images.
     * Prints the usage text and exits when the arguments are invalid.
     *
     * @param args The command-line arguments.
     *
     * @throws Exception If the document cannot be loaded, decrypted or read.
     */
    private void extractImages( String[] args ) throws Exception
    {
        // The upper bound must admit every option together with the file name;
        // a smaller limit rejects valid calls such as
        // "-password pw -prefix p file.pdf".
        if( args.length < 1 || args.length > MAX_ARGUMENTS )
        {
            usage();
            return;
        }

        String pdfFile = null;
        String password = "";
        String prefix = null;
        boolean addKey = false;
        boolean useNonSeqParser = false;
        for( int i = 0; i < args.length; i++ )
        {
            String arg = args[i];
            if( arg.equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    // option value is missing; usage() terminates the JVM
                    usage();
                }
                password = args[i];
            }
            else if( arg.equals( PREFIX ) )
            {
                i++;
                if( i >= args.length )
                {
                    // option value is missing; usage() terminates the JVM
                    usage();
                }
                prefix = args[i];
            }
            else if( arg.equals( ADDKEY ) )
            {
                addKey = true;
            }
            else if( arg.equals( NONSEQ ) )
            {
                useNonSeqParser = true;
            }
            else if( pdfFile == null )
            {
                // the first non-option argument is the PDF file; later ones are ignored
                pdfFile = arg;
            }
        }

        if( pdfFile == null )
        {
            usage();
            return;
        }

        if( prefix == null )
        {
            // Default prefix: the file name without its last four characters
            // (the ".pdf" extension). Names too short for that are used as-is,
            // so the prefix is never null and files are not named "null-N".
            prefix = pdfFile.length() > 4
                    ? pdfFile.substring( 0, pdfFile.length() - 4 )
                    : pdfFile;
        }

        extractFromFile( pdfFile, password, prefix, addKey, useNonSeqParser );
    }

    /**
     * Opens the document, checks the extraction permission and processes the
     * resources of every page. The document is closed in all cases.
     *
     * @param pdfFile path of the PDF document
     * @param password password used for decryption
     * @param prefix prefix of the generated file names
     * @param addKey whether the internal image key is added to the file name
     * @param useNonSeqParser whether to load with the non-sequential parser
     *
     * @throws Exception If the document cannot be loaded, decrypted or read, or
     *         if the current permissions do not allow content extraction.
     */
    private void extractFromFile( String pdfFile, String password, String prefix,
            boolean addKey, boolean useNonSeqParser ) throws Exception
    {
        PDDocument document = null;
        try
        {
            if( useNonSeqParser )
            {
                document = PDDocument.loadNonSeq( new File( pdfFile ), null, password );
            }
            else
            {
                document = PDDocument.load( pdfFile );
                if( document.isEncrypted() )
                {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial( password );
                    document.openProtection( spm );
                }
            }

            AccessPermission ap = document.getCurrentAccessPermission();
            if( !ap.canExtractContent() )
            {
                throw new IOException(
                        "Error: You do not have permission to extract images." );
            }

            for( Object pageObject : document.getDocumentCatalog().getAllPages() )
            {
                PDPage page = (PDPage) pageObject;
                // extract all XObjectImages which are part of the page resources
                processResources( page.getResources(), prefix, addKey );
            }
        }
        finally
        {
            if( document != null )
            {
                document.close();
            }
        }
    }

    /**
     * Writes every image XObject of the given resources to a file and recurses
     * into form XObjects, which may carry resources of their own.
     *
     * @param resources the resources to scan; {@code null} is ignored
     * @param prefix prefix of the generated file names
     * @param addKey whether the internal image key is added to the file name
     *
     * @throws IOException If an image cannot be written.
     */
    private void processResources( PDResources resources, String prefix, boolean addKey )
            throws IOException
    {
        if( resources == null )
        {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if( xobjects == null )
        {
            return;
        }
        for( Map.Entry<String, PDXObject> entry : xobjects.entrySet() )
        {
            PDXObject xobject = entry.getValue();
            if( xobject instanceof PDXObjectImage )
            {
                // write the image
                PDXObjectImage image = (PDXObjectImage) xobject;
                String base = addKey ? prefix + "_" + entry.getKey() : prefix;
                String name = getUniqueFileName( base, image.getSuffix() );
                System.out.println( "Writing image:" + name );
                image.write2file( name );
            }
            else if( xobject instanceof PDXObjectForm )
            {
                // maybe there are more images embedded in a form object
                PDXObjectForm xObjectForm = (PDXObjectForm) xobject;
                processResources( xObjectForm.getResources(), prefix, addKey );
            }
        }
    }

    /**
     * Finds a file name of the form {@code prefix-N} for which
     * {@code prefix-N.suffix} does not exist yet, advancing the image counter
     * for every candidate tried.
     *
     * @param prefix leading part of the file name
     * @param suffix file extension, without the dot
     *
     * @return the unique name, without the extension
     */
    private String getUniqueFileName( String prefix, String suffix )
    {
        String uniqueName;
        File candidate;
        do
        {
            uniqueName = prefix + "-" + imageCounter;
            candidate = new File( uniqueName + "." + suffix );
            imageCounter++;
        }
        while( candidate.exists() );
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.ExtractImages [OPTIONS] <PDF file>\n" +
            " -password <password> Password to decrypt document\n" +
            " -prefix <image-prefix> Image prefix(default to pdf name)\n" +
            " -addkey add the internal image key to the file name\n" +
            " -nonSeq Enables the new non-sequential parser\n" +
            " <PDF file> The PDF document to use\n"
            );
        System.exit( 1 );
    }
}
现在要裁剪图像,您可以使用:
/**
 * Build a thumbnail from the main image: load it, make it opaque, crop it to
 * the given rectangle, scale it to thumbnail size and encode it as JPEG.
 *
 * @param mainImageStream stream that supplies the main image
 * @param crop the rectangle the main image is cropped to
 * @return a stream over the JPEG-encoded bytes of the thumbnail
 * @throws IllegalStateException if any step fails; the original exception
 *         is attached as the cause
 */
public InputStream cropAndScale(InputStream mainImageStream,
        CropRectangle crop) {
    try {
        final RenderedOp loaded = loadImage(mainImageStream);
        final RenderedOp opaque = makeImageOpaque(loaded);
        final RenderedOp cropped = cropImage(opaque, crop);
        final RenderedOp thumbnail = scaleImage(cropped);
        final byte[] encoded = encodeAsJpeg(thumbnail);
        // Wrapping stays inside the try so that any failure here is reported
        // through the same IllegalStateException as the steps above.
        return new ByteArrayInputStream(encoded);
    } catch (Exception cause) {
        throw new IllegalStateException("Failed to scale the image", cause);
    }
}
可在此页面和该项目中找到
还有其他方式可以解析PDF文件中的图像,请参阅这段代码,尤其是这一部分
关于java - 从活页乐谱中提取每一行的小节,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/14864290/
10-12 23:22