java爬虫遇到个页面加密的东西,找了些资料学习学习

做了个java运行js的工具类,希望对大家有用。其中用到的client(用于获取js)可以自行换成自己的client。主要用到了以下几个库:

Rhino就是JavaScript引擎,它的目的就是实现Java与JavaScript的互操作性。rhino-1.7R1.jar

Envjs一个纯js方式在无浏览器环境下模拟浏览器的行为。envjs-1.2.js

一般网站js中都会用到jquery,所以还用了jquery.js

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.lang.ref.SoftReference;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import java.util.zip.ZipException;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.Validate;
import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.RequestAcceptEncoding;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.Args;
import org.apache.http.util.ByteArrayBuffer;
import org.jsoup.Jsoup;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.ContextFactory;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.Scriptable;

//import net.sourceforge.htmlunit.corejs.javascript.Context;
//import net.sourceforge.htmlunit.corejs.javascript.ContextFactory;
//import net.sourceforge.htmlunit.corejs.javascript.Function;
//import net.sourceforge.htmlunit.corejs.javascript.Scriptable; /**
* 参照http://mybeautiful.iteye.com/blog/1442839
* http://m.oschina.net/blog/121347
* http://blog.csdn.net/dwjmantou/article/details/45276967
* http://lcllcl987.iteye.com/blog/87423
* ***不可使用htmlunit的包******Cannot call method "setOptimizationLevel" of null
* @author 5432
*
*/
public class RhinoScaper {
private Context context;
private Scriptable scriptable;
/**
* 初始化方法
*/
public void init(){
context = ContextFactory.getGlobal().enterContext();
scriptable =context.initStandardObjects(null);
context.setOptimizationLevel(-1);
context.setLanguageVersion(Context.VERSION_1_5);
// 初始化测试用,并定义envjs-1.2.js未定义print
context.evaluateString(scriptable,
"var v='sssaass';"
+ "var print = function(v) {"+
" java.lang.System.out.println(v);return v ;"+
" };function hah(){return v }",
"print",1,null);
// System.out.println("v == " + scriptable.get("v", scriptable) );
Function prf = (Function)scriptable.get("print", scriptable);
Object call = prf.call(Context.getCurrentContext(), scriptable, prf, new Object[]{"test"});
// System.out.println("print == "+call.toString());
Object invokFunction = invokFunction("hah");
// System.out.println(invokFunction.toString()); String[] file = { this.getClass().getResource("/")+"envjs-1.2.js", "./lib/jquery.js" };
for (String f : file) {
evaluateJs(f);
}
}
/**
* 调用函数
* @param functionName
* @param functionArags
* @return
*/
public Object invokFunction(String functionName,Object... functionArags) {
Validate.notNull(context, "context is null");
Validate.notNull(scriptable, "scriptable is null");
Function function = (Function) scriptable.get(functionName, scriptable);
Object call = function.call(Context.getCurrentContext(), scriptable, function, functionArags);
// System.out.println("reslult = "+call.toString());
return call;
} /**
* 加载js文件
* (当没有找到对应文件,
* 且要加载文件名路径包含‘envjs-1.2.js’ 会访问 https://raw.githubusercontent.com/ryan-roemer/envjs-1.2/master/env.rhino.1.2.js
* 文件名路径包含‘jquery.js’ 会访问 http://apps.bdimg.com/libs/jquery/1.6.0/jquery.js
* 加载js文件 )
* @param f 文件名路径
*/
public void evaluateJs(String f) {
Validate.notNull(context, "context is null");
Validate.notNull(scriptable, "scriptable is null");
FileReader in = null;
try {
// FileInputStream fI = new FileInputStream(f);
// String js = IOUtils.toString(fI, "UTF-8");//设置默认js文件编码为utf-8
// context.evaluateString(scriptable, js, f, 1, null);
in = new FileReader(f);
context.evaluateReader(scriptable, in, f, 1, null);
} catch (FileNotFoundException e1) {
// e1.printStackTrace();
if (f.contains("envjs-1.2.js")) {
String envjs ="https://raw.githubusercontent.com/ryan-roemer/envjs-1.2/master/env.rhino.1.2.js";
try {
SoftReference<String> htmlString = Client.getHtmlString(envjs);
String jqueryStr = htmlString==null?"":htmlString.get();
// DefaultClient defaultClient = new DefaultClient();
// String jqueryStr =defaultClient.get(envjs).asHtml();
context.evaluateString(scriptable, jqueryStr, envjs, 1, null);
} catch (Exception e) {
e.printStackTrace();
}
} else if (f.contains("jquery.js")) {
String jquery = "http://apps.bdimg.com/libs/jquery/1.6.0/jquery.js";
Reader bufR =null;
try {
SoftReference<Reader> htmlReader = Client.getHtmlReader(jquery);
bufR = htmlReader==null?new BufferedReader(null):htmlReader.get();
// String js = IOUtils.toString(bufR);
context.evaluateReader(scriptable, bufR , jquery, 1, null);
} catch (IOException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}finally {
// close(bufR);
IOUtils.closeQuietly(bufR);
}
} else{
throw new RuntimeException("unknown file "+f);
}
} catch (IOException e1) {
e1.printStackTrace();
}finally {
// close(in);
IOUtils.closeQuietly(in);
}
} public static void main(String[] args) {
RhinoScaper rhinoScaper = new RhinoScaper();
rhinoScaper.init();
// rhinoScaper.JSloadString("jsString", "jsname");
// rhinoScaper.evaluateJs("E:/Desktop/loginjs.js");
// rhinoScaper.loadJS("", classpathURI); // 电信登录加密测试
String pwd="111";
StringBuilder ascending = new StringBuilder();
SoftReference<String> htmlString = null;
try {
htmlString = Client.getHtmlString("http://login.189.cn/bundles/jquery?v=h3Pl8XT8zdNkoI1VbV5sEZOBrSqsxRXX0TIQ9S_lAlM1");
} catch (Exception e) {
e.printStackTrace();
}
String jsStr =htmlString==null?"":htmlString.get();
jsStr = jsStr.replaceAll("float:", "floats:").replaceAll("throws", "throwss");
ascending.append(jsStr);
ascending.append(";\n var input=document.createElement(\"input\");input.value='"+pwd+"';;input.id= 'pass';input.type='password';");
ascending.append("\n function getpassword(){ return $(input).valAesEncryptSet()}");
rhinoScaper.JSloadString(ascending.toString(), "jsname");
Object result = rhinoScaper.invokFunction("getpassword");
System.out.println(result);
try {
htmlString = Client.getHtmlString("http://www.youdaili.net/Daili/");
jsStr =htmlString==null?"":htmlString.get();
String runScript = rhinoScaper.runScript(jsStr);
System.out.println(runScript);
} catch (Exception e) {
e.printStackTrace();
} }
/**
* 运行js
* @param html
* @return
*/
private String runScript(String html) {
String function = null;int jsfrom = 0;
Pattern p = Pattern.compile("setTimeout\\(\"(.*)\\((.*)\\)\", 200\\);");
Matcher m = p.matcher(html);
if(m.find()){
function = m.group(1);//函数名
jsfrom = Integer.parseInt(m.group(2));//参数
}
JSloadString(Jsoup.parse(html).select("script").html().replace("eval(\"qo=eval;qo(po);\")", "return po"), "jsname");
Object result = invokFunction(function, jsfrom);
return result.toString();
}
/**
* 加载js文件
* @param sourceName 名称
* @param classpathURI 文件路径
*/
public void loadJS(String sourceName, String classpathURI) {
Validate.notNull(context, "context is null");
Validate.notNull(scriptable, "scriptable is null");
String js = null;
InputStream inputStream = null;
try {
inputStream = getClass().getResourceAsStream(classpathURI);
js = IOUtils.toString(inputStream, "UTF-8");//设置默认js文件编码为utf-8
} catch (IOException e) {
e.printStackTrace();
} finally {
IOUtils.closeQuietly(inputStream);
}
context.evaluateString(scriptable, js, sourceName, 1, null);
}
/**
* 加载js字符串
* @param source js字符串(注意处理js中由于变量名为throws,float类似名称导致的报错)
* @param sourceName 名称
*/
public void JSloadString(String source, String sourceName){
Validate.notNull(context, "context is null");
Validate.notNull(scriptable, "scriptable is null");
context.evaluateString(scriptable, source, sourceName, 1, null);
}
}
/**
 * Minimal HTTP GET helper built on Apache HttpClient. It downloads a URL,
 * undoes the response's content coding (gzip / deflate) by hand and turns the
 * bytes into text.
 */
class Client {
    /** Chunk size, in bytes, used when copying decompressed data. */
    public static final int BUFFER = 1024;

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36";

    /** Decoded response body together with the raw Content-Type header value (may be null). */
    private static final class Page {
        final byte[] body;
        final String contentType;

        Page(byte[] body, String contentType) {
            this.body = body;
            this.contentType = contentType;
        }
    }

    /**
     * Closes a resource, printing (not propagating) any failure.
     *
     * @param close resource to close; may be null
     */
    public static void close(AutoCloseable close) {
        if (close != null) {
            try {
                close.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Executes a GET request with a freshly built client.
     *
     * <p>The caller must close the returned response. The client that produced
     * it cannot be closed here without invalidating the response, so it is only
     * closed when the request itself fails; {@link #getHtmlString(String)} and
     * {@link #getHtmlReader(String)} close both.
     *
     * @param url address to fetch
     * @return the open response
     * @throws IOException if the request fails
     * @throws ClientProtocolException on an HTTP protocol error
     */
    public static CloseableHttpResponse HttpGetResponse(String url) throws IOException, ClientProtocolException {
        HttpGet httpGet = new HttpGet(URI.create(url));
        CloseableHttpClient client = newClient();
        try {
            return client.execute(httpGet);
        } catch (IOException | RuntimeException e) {
            close(client);
            throw e;
        }
    }

    /**
     * Builds a client that advertises gzip/deflate support but leaves the body
     * compressed, so that {@link #decode(byte[], HttpEntity)} can undo the coding.
     */
    private static CloseableHttpClient newClient() {
        HttpClientBuilder builder = HttpClientBuilder.create().disableContentCompression()
                .setConnectionReuseStrategy(new DefaultConnectionReuseStrategy()).setUserAgent(USER_AGENT);
        builder.addInterceptorLast(new RequestAcceptEncoding());
        builder.setDefaultCookieStore(new BasicCookieStore());
        return builder.build();
    }

    /** Downloads a URL and returns its decoded body; the client and the response are closed before returning. */
    private static Page fetch(String url) throws Exception {
        try (CloseableHttpClient client = newClient();
                CloseableHttpResponse response = client.execute(new HttpGet(URI.create(url)))) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Response for " + url + " has no body: " + response.getStatusLine());
            }
            byte[] binary = HttpEntityTOByte(entity);
            System.out.println(response.getStatusLine().toString());
            // Prints "null" when the server sent no Content-Encoding header.
            System.out.println(entity.getContentEncoding());
            Args.notNull(binary, "binary");
            Header contentType = entity.getContentType();
            return new Page(decode(binary, entity), contentType == null ? null : contentType.getValue());
        }
    }

    /**
     * Downloads a URL as text. The charset comes from the Content-Type header
     * when it names a supported one, otherwise it is detected from the bytes.
     *
     * @param url address to fetch
     * @return soft reference to the text; it is only softly reachable, so copy
     *         the value out with {@code get()} right away and expect a possible null
     * @throws Exception if the request fails or the body cannot be decoded
     */
    public static SoftReference<String> getHtmlString(String url) throws Exception {
        Page page = fetch(url);
        String html = new String(page.body, resolveCharset(page.body, page.contentType));
        return new SoftReference<String>(html);
    }

    /**
     * Downloads a URL and exposes the decoded body through a reader, using the
     * same charset rules as {@link #getHtmlString(String)}.
     *
     * @param url address to fetch
     * @return soft reference to a reader over the in-memory body; it is only
     *         softly reachable, so take the reader out with {@code get()} right
     *         away and expect a possible null
     * @throws Exception if the request fails or the body cannot be decoded
     */
    public static SoftReference<Reader> getHtmlReader(String url) throws Exception {
        Page page = fetch(url);
        Charset charset = resolveCharset(page.body, page.contentType);
        Reader bufR = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(page.body), charset));
        return new SoftReference<Reader>(bufR);
    }

    /** Picks the declared charset if it is supported, else the detector's best guess, else UTF-8. */
    private static Charset resolveCharset(byte[] body, String contentType) {
        Charset declared = lookupCharset(getContentCharSet(contentType));
        if (declared != null) {
            return declared;
        }
        CharsetMatch match = new CharsetDetector().setText(body).detect();
        Charset detected = match == null ? null : lookupCharset(match.getName());
        return detected != null ? detected : StandardCharsets.UTF_8;
    }

    /** Returns the named charset, or null when the name is empty, malformed or unsupported. */
    private static Charset lookupCharset(String name) {
        if (StringUtils.isEmpty(name)) {
            return null;
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException unsupported) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            return null;
        }
    }

    /**
     * Extracts the {@code charset} parameter from a Content-Type value such as
     * {@code text/html; charset="UTF-8"}.
     *
     * @param contentType header value; may be null or empty
     * @return the charset name without surrounding whitespace or quotes, or null if absent
     */
    private static String getContentCharSet(String contentType) throws ParseException {
        if (StringUtils.isEmpty(contentType)) {
            return null;
        }
        for (String part : contentType.split(";")) {
            String[] pair = part.split("=", 2);
            if (pair.length == 2 && "charset".equalsIgnoreCase(pair[0].trim())) {
                String value = StringUtils.strip(pair[1].trim(), "\"'").trim();
                if (value.length() > 0) {
                    return value;
                }
            }
        }
        return null;
    }

    /**
     * Decompresses gzip data.
     *
     * @param data gzip-compressed bytes
     * @return the decompressed bytes
     * @throws Exception if the data is not valid gzip
     * @author http://snowolf.iteye.com/blog/643010
     */
    public static byte[] decompress(byte[] data) throws Exception {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        decompress(new ByteArrayInputStream(data), baos);
        return baos.toByteArray();
    }

    /**
     * Decompresses a gzip stream into {@code os}. The input stream is closed
     * once it has been wrapped successfully; the output stream is left open.
     *
     * @param is gzip-compressed input
     * @param os receives the decompressed bytes
     * @throws Exception if the input is not valid gzip or copying fails
     */
    public static void decompress(InputStream is, OutputStream os)
            throws Exception {
        try (GZIPInputStream gis = new GZIPInputStream(is)) {
            copy(gis, os);
        }
    }

    /** Copies everything from {@code in} to {@code out} in {@link #BUFFER}-sized chunks. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] chunk = new byte[BUFFER];
        int count;
        while ((count = in.read(chunk, 0, BUFFER)) != -1) {
            out.write(chunk, 0, count);
        }
    }

    /** Inflates a "deflate" body: zlib-wrapped first, then raw DEFLATE, which some servers send instead. */
    private static byte[] inflate(byte[] data) throws IOException {
        try {
            return inflate(data, false);
        } catch (ZipException notZlibWrapped) {
            return inflate(data, true);
        }
    }

    /** Inflates {@code data}; {@code nowrap} selects raw DEFLATE without the zlib header and checksum. */
    private static byte[] inflate(byte[] data, boolean nowrap) throws IOException {
        byte[] input = data;
        if (nowrap) {
            // The Inflater documentation asks for one extra dummy input byte in nowrap mode.
            input = new byte[data.length + 1];
            System.arraycopy(data, 0, input, 0, data.length);
        }
        Inflater inflater = new Inflater(nowrap);
        try {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            copy(new InflaterInputStream(new ByteArrayInputStream(input), inflater), out);
            return out.toByteArray();
        } finally {
            // InflaterInputStream does not release an Inflater that was passed in.
            inflater.end();
        }
    }

    /**
     * Undoes the content codings listed in the entity's Content-Encoding header.
     * Codings are removed last-to-first because the header lists them in the
     * order they were applied.
     *
     * @param binary raw response body; returned as is when null
     * @param entity entity whose Content-Encoding header describes {@code binary}
     * @return the decoded body, or {@code binary} itself when nothing had to be undone
     * @throws Exception if a coding is unsupported or the data is corrupt
     */
    public static byte[] decode(byte[] binary, final HttpEntity entity) throws Exception {
        if (binary == null || entity == null || entity.getContentLength() == 0) {
            return binary;
        }
        final Header ceheader = entity.getContentEncoding();
        if (ceheader == null) {
            return binary;
        }
        final HeaderElement[] codecs = ceheader.getElements();
        byte[] decoded = binary;
        for (int i = codecs.length - 1; i >= 0; i--) {
            final String codecname = codecs[i].getName().toLowerCase(Locale.US);
            if ("gzip".equals(codecname) || "x-gzip".equals(codecname)) {
                decoded = decompress(decoded);
            } else if ("deflate".equals(codecname)) {
                decoded = inflate(decoded);
            } else if (!"identity".equals(codecname)) {
                // "identity" means no transformation; anything else is unknown here.
                throw new IOException("Unsupported Content-Coding: " + codecname);
            }
        }
        return decoded;
    }

    /**
     * Reads an entity's content fully into memory, mirroring
     * {@code EntityUtils.toByteArray(entity)}.
     *
     * @param entity entity to read
     * @return the content bytes, or null when the entity has no content stream
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if the entity is null or declares more
     *         than {@code Integer.MAX_VALUE} bytes
     */
    public static byte[] HttpEntityTOByte(HttpEntity entity) throws IOException {
        Args.notNull(entity, "HTTP entity");
        final InputStream instream = entity.getContent();
        if (instream == null) {
            return null;
        }
        try {
            Args.check(entity.getContentLength() <= Integer.MAX_VALUE,
                    "HTTP entity too large to be buffered in memory");
            int capacity = (int) entity.getContentLength();
            if (capacity < 0) {
                // Length unknown (e.g. chunked transfer): start with a default size.
                capacity = 4096;
            }
            final ByteArrayBuffer buffer = new ByteArrayBuffer(capacity);
            final byte[] tmp = new byte[4096];
            int l;
            while ((l = instream.read(tmp)) != -1) {
                buffer.append(tmp, 0, l);
            }
            return buffer.toByteArray();
        } finally {
            instream.close();
        }
    }
}
04-20 13:22