This article shares the full code of a Python spider that fetches Juhuasuan (聚划算) deal pages, extracts the product information, and saves it to disk, for your reference. The details are as follows:
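A note on the environment before the code: this is Python 2 code (it relies on urllib2, httplib, StringIO, and the old BeautifulSoup 3 import), and it additionally requires the third-party chardet and BeautifulSoup packages. A sample of the XML it produces, plus a Python 3 sketch of the fetch step, follow the listing.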

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## this block works around the encoding error raised when printing non-ASCII text to the console
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################

## debug switch: when enabled, the HTTP request headers and debug log are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## switch controlling whether the fetched page source is printed
showSrcCode = False
## compression scheme requested from the server
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "D:/spiderData/"

## default request headers
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-Encoding":ZIP_TYPE}
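## Note: the Accept-Encoding header above asks the server for a gzip
## response; gzipData() in SpiderHandler below decompresses the body
## whenever the Content-Encoding response header confirms gzip.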
#####################################################


#############class SpiderConfig #####################
class SpiderConfig:
 """
  configuration for spider name and url
 """
 def __init__(self, name, url):
  self.name = name
  self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
 """
  Stores the information of one auction scraped by the spider
 """
 def __init__(self):
  # per-instance fields; parse() fills .link, which save() reads back
  self.title = ""
  self.link = ""
  self.img = ""
  self.price = ""

#####################################################

########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
 def http_error_default(self, req, fp, code, msg, hdrs):
  """
   default error process handler for spider
  """
  result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
  result.status = code
  result.url = req.get_full_url()

  print "<", result.url, "Exception code :", result.status, ">"

  return result
#####################################################

#############class SpiderHandler#####################
class SpiderHandler:
 """
  spider handler
 """

 def spider(self, spiderConfig):
  try:
   request = urllib2.Request(spiderConfig.url)

   ## configure request header
   for key,val in headerConfig.items():
    request.add_header(key, val)

   ## build opener
   opener = urllib2.build_opener(SpiderDefaultErrorHandler())

   ## open request
   openRequest = opener.open(request)

   ## read data
   spiderData = openRequest.read()

   ## close
   opener.close()

   if 0 == len(spiderData):
    return

   if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
    spiderData = self.gzipData(spiderData)

   if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
    print spiderData

   # parse html
   self.parse(spiderData)

  except Exception,x:
   print "spider process Exception:", x



 def parse(self, spiderData):
  """
   parse html content
  """

  if httplib.HTTPConnection.debuglevel == DEBUG:
   charsetAnalyze = chardet.detect(spiderData)
   print "analyze spider data encode :",charsetAnalyze["encoding"]

  print "执行解析", fileName

  soup = BeautifulSoup(spiderData)
  encode = soup.originalEncoding

  encoding = lambda x : x.encode(encode)

  if httplib.HTTPConnection.debuglevel == DEBUG:
   print "识别到编码:", encode
   title = soup.head.title.string
   print encoding(title)

  spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"})
  auctions = ["%s" % s for s in spiderContents]

  if not auctions:
   return

  auctionList = []

  for auc in auctions:
   auctionDomain = SpiderAuctionDomain()
   # parse auction link
   links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc)
   if links is not None :
    auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))

   #parse auction title
   titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
   if titles is not None:
    auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))

   #parse auction price
   price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
   if price is not None:
    auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])

   #parse image url
   imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
   if imgs is not None:
    auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])

   auctionList.append(auctionDomain)

  print "成功解析商品信息:"
  for a in auctionList:
   print "--->",a.title

  # sort auction list
  auctionList = SpiderHandler.sortAuctionList(self, auctionList)

  # save in file
  SpiderHandler.save(self, auctionList)

  print "解析完成"

  pass

 def sortAuctionList(self, auctionList):
  """
   bubble sort the auctions by price, ascending; an empty price string
   is treated as 0 so that an unparsed price cannot crash the sort
  """
  toPrice = lambda a: float(a.price) if a.price else 0.0
  length = len(auctionList)
  for i in range(length - 1):
   for j in range(length - i - 1):
    if toPrice(auctionList[j]) > toPrice(auctionList[j+1]):
     auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j]
  return auctionList

 def save(self, auctionList):
  if auctionList is not None:
   doc = Document()

   auctions = doc.createElement("auctions")
   doc.appendChild(auctions)

   for auc in auctionList:
    auction = doc.createElement("auction")
    auctions.appendChild(auction)

    SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
    SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
    SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
    SpiderHandler.generateXML(self, doc, auction, "link", auc.link)

   if False == os.path.exists(location):
    os.mkdir(location)

   file = open(location+fileName+".xml", 'w')
   file.write(doc.toprettyxml())
   file.close()

   if httplib.HTTPConnection.debuglevel == DEBUG:
    print doc.toprettyxml()

 def generateXML(self, doc, f, name, txt):
  """
   append a child element <name> containing the text txt under node f
  """
  c = doc.createElement(name)
  f.appendChild(c)
  c.appendChild(doc.createTextNode(txt))

 def gzipData(self, spiderData):
  """
   get data from gzip
  """
  if 0 == len(spiderData):
   return spiderData
  spiderDataStream = StringIO.StringIO(spiderData)
  spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
  return spiderData
#####################################################
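## The entry point below builds one SpiderConfig per city page, runs the
## spider for each of them, and writes one <cityname>.xml file per city
## into `location`.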

if __name__ == "__main__":
 nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒")

 needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou",
      "hangzhou":"http://ju.taobao.com/hangzhou",
      "shanghai":"http://ju.taobao.com/shanghai",
      "beijing":"http://ju.taobao.com/beijing",
      "chengdu":"http://ju.taobao.com/chengdu"}

 configList = []
 for k,v in needSpiderUrl.items():
  spiderConfig = SpiderConfig(k, v)
  configList.append(spiderConfig)

 spiderHandler = SpiderHandler()

 print "爬虫执行开始时间:",nowtime()
 for spiderConfig in configList:
  fileName = spiderConfig.name
  spiderHandler.spider(spiderConfig)

 print "爬虫执行完毕时间:",nowtime() 

For more examples, refer to the topic series 《python爬取功能汇总》.

That is all for this article. I hope it helps with your studies, and I hope you will continue to support 脚本之家.
