北京艾丽斯妇科医院(http://fuke.fuke120.com/)
首先说一下如何配置Splash
1.利用pip安装scrapy-splash库
pip install scrapy-splash
2.现在就要用到另一个神器(Docker)
Docker下载地址:https://www.docker.com/community-edition#/windows
3.安装好Docker后启动Docker拉取镜像
docker pull scrapinghub/splash
4.利用Docker运行splash
docker run -p 8050:8050 scrapinghub/splash
(运行之后可以在浏览器中打开 http://192.168.99.100:8050 ,检查Splash是否已正常启动。192.168.99.100 一般是 Docker Toolbox 虚拟机的默认IP;若使用的是 Docker Desktop,通常应改用 http://localhost:8050 )
5.settings.py配置
# Address of the Splash HTTP API.
# Author's note: this is the biggest pitfall -- the address must be
# 192.168.99.100; using the host machine's own IP never worked for them.
# NOTE(review): presumably this is the Docker VM address on the author's
# setup -- confirm the address your Splash container is reachable on.
SPLASH_URL = 'http://192.168.99.100:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Author's note: some sites work with True, others require changing this
# to False.
ROBOTSTXT_OBEY = True
爬虫的py文件1.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from scrapy.http import Request
# from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
# NOTE(review): HtmlXPathSelector is a deprecated Scrapy API -- confirm it
# still exists in the installed Scrapy version.
from scrapy.selector import HtmlXPathSelector
import redis  # Redis client library

# MongoDB handles: database "Health", collection "Healthclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Healthclass

# Redis connection that receives the 'healthclassurl' list entries.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Running counter of ordinary category links seen by the spider's parse().
ii = 0
class healthcareClassSpider(scrapy.Spider):
    """Collect the top-level category links from the fuke120 home page.

    Each category is inserted into the module-level MongoDB ``collection``
    with ``pid`` set to ``None``, and a ``"<mongo id>,<url>,<index>"`` string
    is pushed onto the Redis list ``healthclassurl`` through the module-level
    ``r`` connection.
    """

    name = "HealthCare"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    start_urls = [
        "http://fuke.fuke120.com/",
    ]

    def parse(self, response):
        """Store every category anchor found inside the ``allsort`` block.

        Scrapy calls this for each downloaded start URL. Anchors under
        ``div.item`` are numbered with the module-level counter ``ii``;
        anchors under ``div.item born`` are always recorded with index 0.
        """
        global ii
        regular_links = response.xpath(
            '//div[@id="allsort"]/div[@class="item"]/span/a')
        born_links = response.xpath(
            '//div[@id="allsort"]/div[@class="item born"]/span/a')

        for link in regular_links:
            ii += 1
            self._store_category(response, link, ii)
        for link in born_links:
            self._store_category(response, link, 0)

    def _store_category(self, response, link, index):
        """Save one category anchor to MongoDB and queue its URL in Redis."""
        href = link.xpath("@href").extract_first()
        if href is None:
            # An anchor without href has no page to crawl later.
            return
        # urljoin also handles absolute and page-relative hrefs, which plain
        # string concatenation with the site root would mangle.
        page_url = response.urljoin(href)
        # Stored as the list of text nodes, as in the original data layout.
        title = link.xpath("text()").extract()
        self.logger.info("category %s -> %s", title, page_url)
        classid = collection.insert_one(
            {'healthclass': title, 'pid': None}).inserted_id
        r.lpush('healthclassurl', '%s,%s,%s' % (classid, page_url, index))
2.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request
# from hello.items import ZhaopinItem
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from urllib.request import Request,ProxyHandler
from urllib.request import build_opener
# MongoDB handles: database "Health", collection "Diseaseclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Diseaseclass

import redis  # Redis client library

# Redis connection used to read 'healthclassurl' and write 'diseaseclassurl'.
# `encoding` replaces the deprecated `charset` keyword of redis-py.
# NOTE(review): 1.py pushes to Redis on 127.0.0.1 while this script reads
# from 192.168.60.112 -- confirm both point at the same server.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')
class healthcareClassSpider(scrapy.Spider):
    """Crawl every category page queued in Redis and record its diseases.

    Start URLs come from the Redis list ``healthclassurl``, whose entries
    have the form ``"<mongo id>,<url>,<index>"``. Each disease link found is
    inserted into the module-level MongoDB ``collection`` with ``pid`` set to
    the parent category id, and ``"<mongo id>,<url>,<pid>"`` is pushed onto
    the Redis list ``diseaseclassurl``.
    """

    name = "HealthCare1"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    dict = {}  # url -> {"pid": ..., "num": ...}; original name kept for compatibility
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their parent ids from Redis."""
        super().__init__(*args, **kwargs)
        # Per-instance containers, so repeated instantiation does not keep
        # appending to the class-level list and dict.
        self.start_urls = []
        self.dict = {}
        for raw in r.lrange('healthclassurl', 0, -1):
            entry = raw.decode()
            try:
                # The id comes first and the index last; splitting from both
                # ends keeps URLs that contain commas intact.
                pid, rest = entry.split(',', 1)
                url, num = rest.rsplit(',', 1)
            except ValueError:
                self.logger.warning("Skipping malformed queue entry: %r", entry)
                continue
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    def parse(self, response):
        """Store each disease link of a category page under its parent id."""
        # After a redirect response.url differs from the queued URL; the
        # first entry of 'redirect_urls' is the URL originally requested.
        requested_url = response.meta.get('redirect_urls', [response.url])[0]
        info = self.dict.get(requested_url)
        if info is None:
            self.logger.warning("No parent category known for %s", response.url)
            return
        pid = ObjectId(info['pid'])

        links = response.xpath(
            '//div[@class="x_con02_2"]/div[@class="x_con02_3"]/ul/li/p/a')
        for link in links:
            href = link.xpath("@href").extract_first()
            if href is None:
                continue
            url = response.urljoin(href)
            # Stored as the list of text nodes, as in the original data layout.
            name = link.xpath("text()").extract()
            self.logger.info("disease %s -> %s", name, url)
            classid = collection.insert_one(
                {'Diseaseclass': name, 'pid': pid}).inserted_id
            r.lpush('diseaseclassurl', '%s,%s,%s' % (classid, url, pid))
3.py
# -*- coding: utf-8 -*-
import re
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
from lxml import etree
import pymongo
import scrapy
from scrapy_splash import SplashMiddleware
from scrapy.http import Request, HtmlResponse
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from bson.objectid import ObjectId
# from diseaseHealth.diseaseHealth.spiders.SpiderJsDynamic import phantomjs1
# from scrapy.http import Request
# from urllib.request import urlopen
from scrapy.http import Request

# MongoDB handles: database "Health", collection "Treatclass".
client = pymongo.MongoClient(host="127.0.0.1")
db = client.Health
collection = db.Treatclass

import redis  # Redis client library
# Redis connection used to read 'diseaseclassurl' and write 'treatclassurl'.
# `encoding` replaces the deprecated `charset` keyword of redis-py.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')


class healthcareClassSpider(scrapy.Spider):
    """Render every disease page through Splash and record its sub-links.

    Start URLs come from the Redis list ``diseaseclassurl``, whose entries
    have the form ``"<mongo id>,<url>,<parent id>"``. Each link found is
    inserted into the module-level MongoDB ``collection`` with ``pid`` set to
    the disease id, and ``"<mongo id>,<url>,<pid>"`` is pushed onto the Redis
    list ``treatclassurl``.
    """

    name = "HealthCare2"
    allowed_domains = ["fuke120.com"]  # domains the spider may visit
    dict = {}  # url -> {"pid": ..., "num": ...}; original name kept for compatibility
    start_urls = []

    def __init__(self, *args, **kwargs):
        """Load the start URLs and their parent ids from Redis."""
        super().__init__(*args, **kwargs)
        # Per-instance containers, so repeated instantiation does not keep
        # appending to the class-level list and dict.
        self.start_urls = []
        self.dict = {}
        for raw in r.lrange('diseaseclassurl', 0, -1):
            entry = raw.decode()
            try:
                # The id comes first and the last field has no comma;
                # splitting from both ends keeps URLs with commas intact.
                pid, rest = entry.split(',', 1)
                url, num = rest.rsplit(',', 1)
            except ValueError:
                self.logger.warning("Skipping malformed queue entry: %r", entry)
                continue
            self.start_urls.append(url)
            self.dict[url] = {"pid": pid, "num": num}

    def start_requests(self):
        """Request every start URL through Splash, waiting 0.5 s per page."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, args={'wait': 0.5})

    def parse(self, response):
        """Store each navigation link of a rendered page under its parent id."""
        info = self.dict.get(response.url)
        if info is None:
            self.logger.warning("No parent entry known for %s", response.url)
            return
        pid = ObjectId(info['pid'])
        self.logger.info("parent %s (queue field: %s)", pid, info['num'])

        links = response.xpath(
            '//div[@class="dh01"]/ul[@class="ul_bg01"]/li/a')
        for link in links:
            href = link.xpath("@href").extract_first()
            if href is None:
                continue
            page_url = response.urljoin(href)
            # Stored as the list of text nodes, as in the original data layout.
            name = link.xpath("text()").extract()
            self.logger.info("treat class %s -> %s", name, page_url)
            classid = collection.insert_one(
                {'Treatclass': name, 'pid': pid}).inserted_id
            r.lpush('treatclassurl', '%s,%s,%s' % (classid, page_url, pid))
大功告成,主要还是为了使用scrapy-splash。