Jsoup解析获取品花社图片
emmmm,闲着没事,想起了之前一个学长做的品花社的APP,刚好之前有了解Jsoup这个Java解析HTML的库,便花了三四个小时写了这个东西,把网站上大大小小的MM的图片都一股脑地爬到本地,并按每个MM一个文件夹做了简单的分装。
项目只是学习一下Jsoup的使用和网页解析相关知识,没其他意思。
全部图片的体积大概有4个多G。代码默认只抓取第1页,想抓取全部图片的话,把代码里的MAX_PAGES_NUM变量值改大即可(截至2018年6月21日,最大为102)。
项目地址:https://github.com/Ganart/SpiderMM36D
关于品花社是什么,自己点进去看吧:http://www.mm36d.com/
结果看图吧:
代码在这儿:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
/**
 * Small crawler for www.mm36d.com.
 *
 * <p>It walks the listing pages, collects one thumbnail per entry, downloads
 * that thumbnail and then every image found on the entry's detail page. All
 * files of one entry are stored in a folder named after the entry id below
 * {@link #saveDir}.
 *
 * <p>The class is single-threaded and keeps its working state in static
 * fields; it is not safe to call from several threads at once.
 */
public class SpiderMM36D {

    /** One thumbnail parsed from a listing page. */
    static class SmallImg {
        /** Entry id taken from the parenthesised part of the {@code <img>} markup. */
        String id;
        /** Thumbnail URL taken from the {@code data-original} attribute. */
        String url;
    }

    /**
     * Number of listing pages to crawl, starting with page 1.
     * As of 2018-06-21 the largest usable value is 102.
     */
    static int MAX_PAGES_NUM = 1;

    /** Base URL of the listing pages; the page number is appended. */
    static String url = "http://www.mm36d.com/home/0/";

    /**
     * Base URL of the detail pages; the entry id and "/2" are appended,
     * e.g. http://www.mm36d.com/belle/0/0/id/2
     */
    static String detailUrl = "http://www.mm36d.com/belle/0/0/";

    /** Root folder for all downloads; one sub-folder per entry id is created in it. */
    static String saveDir = "E:\\SpiderMM36D\\mmImgs";

    /** Image URLs of the detail page currently being processed. */
    static List<String> oneMMDetailUrls = new ArrayList<>();

    /** Thumbnails collected from all listing pages. */
    static List<SmallImg> allImgObjects = new ArrayList<>();

    /** Attribute prefix that precedes the lazily loaded image URL in the markup. */
    private static final String DATA_ORIGINAL = "data-original=\"";

    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /** Connect and read timeout for image downloads, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Collects the thumbnails of the first {@link #MAX_PAGES_NUM} listing
     * pages, then downloads each thumbnail together with its detail images,
     * printing the completed fraction after every entry.
     *
     * @param args unused
     * @throws IOException declared for compatibility; I/O failures of single
     *         pages or images are reported on stderr and do not stop the run
     */
    public static void main(String[] args) throws IOException {
        for (int page = 1; page <= MAX_PAGES_NUM; page++) {
            gainOnePageElement(page);
        }
        System.out.println("完成图片链接解析,开始下载图片");
        int total = allImgObjects.size();
        for (int i = 0; i < total; i++) {
            downloadSmallImg(allImgObjects.get(i));
            System.out.println("任务进度:" + ((float) (i + 1) / total));
        }
    }

    /**
     * Loads one listing page and appends every thumbnail found on it to
     * {@link #allImgObjects}.
     *
     * @param index page number that is appended to {@link #url}
     */
    public static void gainOnePageElement(int index) {
        String pageUrl = url + index;
        try {
            Document document = Jsoup.connect(pageUrl).get();
            Elements elements = document.getElementsByClass("re-size1-img");
            for (Element element : elements) {
                allImgObjects.add(subMMImgUrl(element.getElementsByTag("img").toString()));
            }
        } catch (IOException e) {
            System.err.println("Failed to load listing page " + pageUrl);
            e.printStackTrace();
        }
    }

    /**
     * Extracts the thumbnail URL and the entry id from the markup of an
     * {@code <img>} element.
     *
     * @param tag markup of the {@code <img>} element(s); may be empty or null
     * @return a new {@link SmallImg}; {@code url} is the value of the
     *         {@code data-original} attribute and {@code id} is the text
     *         between the first "(" and the following ")". A part that cannot
     *         be found is returned as an empty string instead of throwing.
     */
    public static SmallImg subMMImgUrl(String tag) {
        SmallImg smallImg = new SmallImg();
        smallImg.url = subDetailImgUlr(tag);
        smallImg.id = "";
        if (tag != null) {
            int open = tag.indexOf('(');
            int close = tag.indexOf(')', open + 1);
            if (open >= 0 && close > open) {
                smallImg.id = tag.substring(open + 1, close);
            }
        }
        return smallImg;
    }

    /**
     * Extracts the image URL from the markup of an {@code <img>} element.
     *
     * @param tag markup of the {@code <img>} element(s); may be empty or null
     * @return the value of the first {@code data-original} attribute, or an
     *         empty string when the attribute is missing or unterminated
     */
    public static String subDetailImgUlr(String tag) {
        if (tag == null) {
            return "";
        }
        int start = tag.indexOf(DATA_ORIGINAL);
        if (start < 0) {
            return "";
        }
        start += DATA_ORIGINAL.length();
        int end = tag.indexOf('"', start);
        return end < 0 ? "" : tag.substring(start, end);
    }

    /**
     * Downloads the thumbnail of one entry to {@code <saveDir>/<id>/<id><ext>}
     * and, if that succeeded, all images of the entry's detail page.
     *
     * <p>Entries without a URL or with an id that is not usable as a folder
     * name are skipped. I/O failures are reported on stderr.
     *
     * @param smallImg thumbnail to download
     */
    public static void downloadSmallImg(SmallImg smallImg) {
        if (smallImg == null || smallImg.url == null || smallImg.url.isEmpty()
                || !isSafePathSegment(smallImg.id)) {
            return;
        }
        try {
            File target = new File(new File(saveDir, smallImg.id),
                    smallImg.id + fileExtension(smallImg.url));
            saveImage(smallImg.url, target);
            // The detail page is only visited when the thumbnail could be stored.
            gainDetailPageElement(smallImg.id);
        } catch (IOException e) {
            System.err.println("Failed to download thumbnail " + smallImg.url);
            e.printStackTrace();
        } finally {
            oneMMDetailUrls.clear();
        }
    }

    /**
     * Downloads one detail image to {@code <saveDir>/<id>/<id>_<i><ext>}.
     *
     * <p>An empty URL or an id that is not usable as a folder name is
     * skipped. I/O failures are reported on stderr.
     *
     * @param id     entry id, used as folder name and file name prefix
     * @param picUrl URL of the image
     * @param i      running number of the image within the entry
     */
    public static void downloadBigImg(String id, String picUrl, int i) {
        if (picUrl == null || picUrl.isEmpty() || !isSafePathSegment(id)) {
            return;
        }
        try {
            File target = new File(new File(saveDir, id), id + "_" + i + fileExtension(picUrl));
            saveImage(picUrl, target);
        } catch (IOException e) {
            System.err.println("Failed to download image " + picUrl);
            e.printStackTrace();
        }
    }

    /**
     * Loads the detail page of one entry and downloads every image on it.
     *
     * @param id entry id that is inserted into the detail page URL
     */
    public static void gainDetailPageElement(String id) {
        // Start from a clean list so URLs of a previous entry are never reused.
        oneMMDetailUrls.clear();
        String pageUrl = detailUrl + id + "/2";
        try {
            Document document = Jsoup.connect(pageUrl).get();
            Elements elements = document.getElementsByClass("re-sizemm");
            for (Element element : elements) {
                oneMMDetailUrls.add(subDetailImgUlr(element.getElementsByTag("img").toString()));
            }
        } catch (IOException e) {
            System.err.println("Failed to load detail page " + pageUrl);
            e.printStackTrace();
            return;
        }
        for (int j = 0; j < oneMMDetailUrls.size(); j++) {
            downloadBigImg(id, oneMMDetailUrls.get(j), j);
        }
    }

    /** Copies the content behind {@code imageUrl} into {@code target}, creating its folder if needed. */
    private static void saveImage(String imageUrl, File target) throws IOException {
        URLConnection connection = new URL(imageUrl).openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
            // The folder is created only after the download has started successfully.
            File dir = target.getParentFile();
            if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("Cannot create directory " + dir);
            }
            try (OutputStream out = new BufferedOutputStream(new FileOutputStream(target))) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int len;
                while ((len = in.read(buffer)) != -1) {
                    out.write(buffer, 0, len);
                }
            }
        }
    }

    /** Returns the extension (including the dot) of the last path segment of a URL, or "" if it has none. */
    private static String fileExtension(String imageUrl) {
        int end = imageUrl.length();
        int query = imageUrl.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = imageUrl.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = imageUrl.substring(0, end);
        int dot = path.lastIndexOf('.');
        if (dot <= path.lastIndexOf('/')) {
            return "";
        }
        String extension = path.substring(dot);
        return extension.indexOf('\\') < 0 ? extension : "";
    }

    /** Tells whether a scraped name can be used as a single folder or file name without leaving {@link #saveDir}. */
    private static boolean isSafePathSegment(String name) {
        return name != null && !name.isEmpty()
                && !name.equals(".") && !name.equals("..")
                && name.indexOf('/') < 0 && name.indexOf('\\') < 0;
    }
}