import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map; import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; /**
* grid2008的代码,解析了优先出版
*
*/ public class GetCkTest20082 {
//定义cat标记 注意大写。你懂的
static String cat="CJFQ";
static String sKuakuID = "";
public static void main(String[] args) { //列表url Map<String,String> cookies = getCookie(cat); if(cookies.size() >0){
//列表
String listUrl = "http://epub.cnki.net/grid2008/brief/brief.aspx?pagename=asp.brief_result_aspx&dbprefix=scdb&skuakuid="+sKuakuID+
"&loadgroup=1&prio=true&stab=normal&turnpage=1&recordsperpage=20&queryid="+sKuakuID+"&id=&curpage=3"; //文章最初链接
List<String> articleInitUrls = new ArrayList<String>();
Connection conn = Jsoup.connect(listUrl);
conn.method(Method.GET);
conn.followRedirects(false);
conn.timeout(5000);
conn.cookies(cookies);
try {
Document doc = conn.get();
Elements links = doc.select("a[target=NewBriefDetail]");
if(links.size() <= 0){
System.out.println("没有更多文章。");
}else{
for(Element link : links){
articleInitUrls.add(link.attr("abs:href"));
//System.out.println(articleInitUrls);
}
} } catch (IOException e) {
System.out.println("链接超时了。。");
} if(articleInitUrls.size()<=0){
System.out.println("没有文章!");
}
for(String articleInitUrl : articleInitUrls){ Connection conn2 = Jsoup.connect(articleInitUrl); conn2.header("Referer", listUrl);
conn2.cookies(cookies);
conn2.followRedirects(false);
try {
Document doc = conn2.get();
Elements links = doc.select("h2 > a[href]");
//System.out.println(links);
if(links.size()<=0){
System.out.println("最初链接为:【"+articleInitUrl+"】的文章获取实际链接失败!");
}else{ String url1 = links.toString();
//System.out.println(url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html")));
String url2=url1.substring(url1.indexOf("detail%252f")+11,url1.indexOf(".html")); //String articleUrl = links.get(0).attr("href"); String articleUrl="www.cnki.net/kcms/detail/"+url2+".html";
System.out.println(articleUrl);
//getArticle(articleUrl);
}
} catch (IOException e) {
System.out.println("最初链接为:【"+articleInitUrl+"】的文章链接超时!");
}
}
} } public static Map<String,String> getCookie(String cat) { String listUrl = "http://epub.cnki.net/grid2008/brief/Result.aspx";
//检索 String searchHander = "http://epub.cnki.net/grid2008/request/search.aspx?PageName=ASP.brief_result_aspx&DBViewType=FullText";
Connection conn = Jsoup.connect(listUrl);
conn.method(Method.GET);
conn.followRedirects(false);
conn.timeout(5000);
try {
Document doc = conn.get();
String db_opt = doc.select("input#db_opt").attr("value");
String db_prefix = doc.select("input#db_prefix").attr("value");
String db_configfile = doc.select("input#db_configfile").attr("value"); String searchHanderUrl = searchHander+"&DbCatalog="+db_opt+"&DbPrefix="+db_prefix+"&ConfigFile="+db_configfile;
Connection conn2 = Jsoup.connect(searchHanderUrl);
conn2.method(Method.GET);
conn2.followRedirects(false);
conn2.timeout(5000);
Response response;
Document doc2 = conn2.get();
String responseContent = doc2.select("body").text();
if(responseContent.indexOf("sKuakuID") !=-1){
System.out.println(responseContent.substring(responseContent.indexOf("sKuakuID")+9));
sKuakuID = responseContent.substring(responseContent.indexOf("sKuakuID")+9);
}
response = conn2.response();
return response.cookies();
} catch (IOException e) {
System.out.println("获取cookies的链接超时了。你懂的!");
return new HashMap<String,String>();
} } public static void getArticle(String articleUrl) {
Connection conn = Jsoup.connect(articleUrl);
conn.method(Method.GET);
conn.followRedirects(false);
conn.timeout(5000);
try {
Document doc = conn.get();
//这里只打印标题了。
Elements links = doc.select("span#chTitle");
System.out.println("文章标题:"+links.get(0).text()+"——链接:【"+articleUrl+"】");
} catch (IOException e) {
System.out.println("链接文章:【"+articleUrl+"】超时了。");
} } }