Started reading a novel this morning, and by the afternoon I had the itch to pull it off the website and turn it into an ebook. Heh, just tinkering ~ so let's get on with it ~

【Packet capture】

This step matters more than anything else: if you can't find the request that fetches the real content, there is nothing more to do ~

The first plan was to download every page with Thunder (迅雷) and process them locally, but the saved pages turned out to contain only the page chrome and none of the text. A look at the JavaScript explained it: on ready, the page fires an ajax POST to another URL to fetch the content.

So I captured the traffic to confirm. Packet-capture tools turned out to be a hassle: I tried two or three without success and finally got it done with Firefox. Opening the page sends 50 requests in total, but only two of them are POSTs, so the HTTP payload I needed showed up quickly.

[Screenshot: the captured HTTP POST request]

【Writing a program to fetch it】

URL, request headers, form: all in hand, so what was I waiting for? Time to write the code and crawl. I had worried about having to impersonate a browser and fill in cookie values, but once I started debugging it turned out I was overthinking it: going straight at the URL with the form attached is enough ~

HttpClient usage was picked up on the spot: the official example QuickStart.java is clear enough, and beyond that I just stepped through in the debugger to look at the request and response. A bare-bones version of the pattern is sketched below.
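To make that concrete, here is a minimal POST in the spirit of QuickStart.java. The URL and the "b"/"c" field names are placeholders standing in for whatever the packet capture showed; only the overall shape of the call matters.

package mycrawl;

import java.util.Arrays;

import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class QuickPostDemo {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            // Placeholder URL and field names; the real ones come from the capture.
            HttpPost post = new HttpPost("http://example.com/ajax/chapter");
            post.setEntity(new UrlEncodedFormEntity(Arrays.asList(
                    new BasicNameValuePair("b", "12345"),   // book id
                    new BasicNameValuePair("c", "1"))));    // chapter/page id
            try (CloseableHttpResponse resp = client.execute(post)) {
                System.out.println(resp.getStatusLine());
                System.out.println(EntityUtils.toString(resp.getEntity()));
            }
        }
    }
}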

For the actual usage, see the HttpPost setup and crawlOnePage(HttpRequestBase) in the listing at the end. I won't embarrass myself by pasting MyFileWriter; it's just file I/O (a rough stand-in is sketched after this paragraph).
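MyFileWriter itself isn't shown in the post. Purely as a hypothetical stand-in, shaped only by how it is called in the main listing (constructor taking a file-name pattern such as "c:\\book*.txt", rollingAppend(), getFileWriter()), something like the following would compile and work; the real class presumably rolls output across several files, while this sketch just writes everything to one file.

package crawl.common;

import java.io.FileWriter;
import java.io.IOException;

public class MyFileWriter {

    private final FileWriter writer;

    public MyFileWriter(String pattern) throws IOException {
        // "book*.txt" -> "book1.txt"; a real rolling writer would bump this index.
        this.writer = new FileWriter(pattern.replace("*", "1"), true);
    }

    /** Append a chunk of text to the current output file. */
    public void rollingAppend(String text) throws IOException {
        writer.write(text);
    }

    /** Expose the underlying writer for flush()/close(). */
    public FileWriter getFileWriter() {
        return writer;
    }
}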

Problems encountered and solved:

1. The resource in the response comes back gzip-compressed and has to be decoded with the matching entity class (see the sketch after this list);

2. A few numbers in the URL sequence have no page behind them; checking the status code in the response and skipping those is enough.
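On point 1, another common HttpClient 4.x pattern, sketched here with a made-up helper class name, is to check the Content-Encoding header yourself and wrap the raw entity in GzipDecompressingEntity before reading it, instead of relying on the entity already arriving as one:

package mycrawl;

import java.io.IOException;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.util.EntityUtils;

public class GzipBodyReader {

    /** Read an entity body as a String, decompressing it first if the
     *  server marked it as gzip-encoded. */
    public static String readBody(HttpEntity entity) throws IOException {
        Header encoding = entity.getContentEncoding();
        if (encoding != null && "gzip".equalsIgnoreCase(encoding.getValue())) {
            // Wrap the raw entity so EntityUtils sees the decompressed bytes.
            entity = new GzipDecompressingEntity(entity);
        }
        return EntityUtils.toString(entity);
    }
}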

(End of the write-up; the code follows.)

package mycrawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import crawl.common.MyFileWriter;

public class MyCrawl {

    private static CloseableHttpClient httpclient = HttpClients.createDefault();

    /**
     * (1) Build the POST request: the URL and the form.
     * (2) Loop over the pages, fetch each one, clean it up and write it out.
     *
     * @param args
     * @throws IOException
     */
    public static void main(String[] args) throws IOException {
        final int startChapter = 页面序列号;   // start page id (redacted)
        final int endChapter = 页面序列号;     // end page id (redacted)
        final Integer bookId = bookid;         // book id (redacted)
        String outPattern = "c:\\book*.txt";
        MyFileWriter fw = new MyFileWriter(outPattern);

        // Build the POST request
        HttpPost httpPost = new HttpPost("网址"); // target URL (redacted)
        List<NameValuePair> nvps = new ArrayList<NameValuePair>(2);
        try {
            // Form fields of the POST
            nvps.add(new BasicNameValuePair("b", bookId.toString()));
            nvps.add(new BasicNameValuePair("c", "placeholder"));
            for (Integer i = startChapter, j = 0; i <= endChapter; i++, j++) {
                // Fetch consecutive chapters in a loop
                nvps.set(1, new BasicNameValuePair("c", i.toString()));
                httpPost.setEntity(new UrlEncodedFormEntity(nvps));
                String outStr = MyCrawl.crawlOnePage(httpPost);
                if (outStr == null || outStr.isEmpty()) {
                    // Missing page: keep the chapter-title index in step
                    j--;
                    continue;
                }
                // Prepend the chapter title; too lazy to crawl the title page.
                outStr = "====== " + MyCrawl.chapterArr[j] + "\r\n"
                        + MyCrawl.prettyTxt(outStr);
                // System.out.println(outStr);
                fw.rollingAppend(outStr);
            }
            fw.getFileWriter().flush();
            fw.getFileWriter().close();
            System.out.println("Done.");
        } finally {
            httpclient.close();
        }
    }

    /**
     * Fetch a single page.
     *
     * @param req
     * @return result
     * @throws ClientProtocolException
     * @throws IOException
     */
    public static String crawlOnePage(HttpRequestBase req)
            throws ClientProtocolException, IOException {
        String result;
        CloseableHttpResponse resp = httpclient.execute(req);

        // Handle the status code
        int status = resp.getStatusLine().getStatusCode();
        if (status < 200 || status >= 300) {
            System.out.println("[Error] " + resp.getStatusLine().toString());
            resp.close();
            return "";
        } else if (status != 200) {
            System.out.println("[Warn] " + resp.getStatusLine().toString());
            resp.close();
            return "";
        }

        HttpEntity entity = resp.getEntity();
        if (entity instanceof GzipDecompressingEntity) {
            // Decompress the gzip-encoded body
            GzipDecompressingEntity gEntity = (GzipDecompressingEntity) entity;
            result = EntityUtils.toString(gEntity);
        } else {
            result = EntityUtils.toString(entity);
        }
        EntityUtils.consume(entity);
        resp.close();
        return result;
    }

    /**
     * Clean up line breaks and other special characters.
     *
     * @param txt
     * @return string
     */
    public static String prettyTxt(String txt) {
        if (txt == null || txt.isEmpty()) {
            return "";
        }
        int contentStart = txt.indexOf("content") + 10;
        int contentEnd = txt.indexOf("  <br/><br/>  \",\"next");
        txt = txt.substring(contentStart, contentEnd);
        return txt.replace("<br/><br/>", "\r\n");
    }

    // Chapter titles
    private static final String[] chapterArr = new String[] { "第一章",
            "第二章" };
}