"""
A thin wrapper around the Scrapyd JSON API for convenience.

To upload an egg package, create an ``eggs`` folder in the directory that
contains this file and place the .egg file inside it.
"""
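# Expected layout for egg uploads (a sketch of the convention described above;
# the file names are illustrative assumptions, not part of this module):
#
#   scrapyd_api.py        <- this module
#   eggs/
#       myproject.egg     <- e.g. built with `scrapyd-deploy --build-egg eggs/myproject.egg`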
import requests
import demjson
import pandas as pd
import os
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

DEFAULT_SCRAPYDIP = '127.0.0.1'
DEFAULT_SCRAPYDPORT = '6800'


class ScrapydApi(object):

    def __init__(self, ip=DEFAULT_SCRAPYDIP, port=DEFAULT_SCRAPYDPORT):
        self.ip = ip
        self.port = port
        self.url = 'http://' + ip + ':' + port

    def addversion(self, project_name, version, egg):
        """
        Upload an egg to the server. The egg file is looked up in the
        ``eggs`` folder next to this file.
        :param project_name: project name
        :param version: version string
        :param egg: file name of the egg inside the eggs folder
        :return:
        """
        egg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'eggs', egg)
        os.system('curl %s/addversion.json -F project=%s -F version=%s -F egg=@%s'
                  % (self.url, project_name, version, egg_path))

    def _scheduler(self, project_name, spider_name, setting=None, jobid=None, _version=None):
        """
        Run a spider.
        :param project_name: project name
        :param spider_name: spider name
        :return: decoded JSON response
        """
        data = {
            'project': project_name,
            'spider': spider_name,
            'setting': setting,
            'jobid': jobid,
            '_version': _version
        }
        # requests drops keys whose value is None, so optional parameters
        # are simply omitted from the form data.
        ret = requests.post('%s/schedule.json' % self.url, data=data).text
        return demjson.decode(ret)

    def scheduler(self, project_name, spider_name, setting=None, jobid=None, _version=None):
        ret = self._scheduler(project_name, spider_name, setting, jobid, _version)
        self._scheduler_log(project_name, spider_name, ret)

    def scheduler_all(self, project_name, setting=None, jobid=None, _version=None):
        spiders = self._get_spiders_list(project_name)['spiders']
        for spider in spiders:
            ret = self._scheduler(project_name, spider, setting, jobid, _version)
            self._scheduler_log(project_name, spider, ret)

    def _scheduler_log(self, project_name, spider_name, ret):
        if ret['status'] == 'ok':
            logger.info('project: %s spider: %s job id: %s'
                        % (project_name, spider_name, ret['jobid']))
        else:
            logger.error(ret['message'])

    def _get_spiders_list(self, project_name):
        ret = requests.get('%s/listspiders.json?project=%s' % (self.url, project_name)).text
        payload = demjson.decode(ret)
        if payload['status'] == 'ok':
            return payload
        logger.error(payload['message'])

    def show_spiders_list(self, project_name):
        """
        List the spiders of a project.
        :param project_name: project name
        :return:
        """
        payload = self._get_spiders_list(project_name)
        spiders = payload['spiders']
        if spiders:
            result = 'project: %s -- spiders: %s' % (project_name, ' '.join(spiders))
        else:
            result = 'the spider list is empty'
        logger.info(result)

    def _cancel(self, project_name, jobid):
        data = {
            'project': project_name,
            'job': jobid
        }
        ret = requests.post('%s/cancel.json' % self.url, data=data).text
        return demjson.decode(ret)

    def cancel(self, project_name, jobid):
        """
        Cancel a spider job.
        :param project_name: project name
        :param jobid: job id
        :return:
        """
        ret = self._cancel(project_name, jobid)
        self._cancel_log(project_name, jobid, ret)

    def cancel_all(self, project_name):
        jobs = self._get_jobs_list(project_name)
        if jobs['status'] == 'ok':
            for job in jobs['running']:
                ret = self._cancel(project_name, job['id'])
                self._cancel_log(project_name, job['id'], ret)
        else:
            logger.error(jobs['message'])

    def _cancel_log(self, project_name, jobid, ret):
        if ret['status'] == 'ok':
            logger.info('project: %s job: %s cancelled, previous state: %s'
                        % (project_name, jobid, ret['prevstate']))
        else:
            logger.error(ret['message'])

    def show_version_list(self, project_name):
        """
        List the versions of a project.
        :param project_name: project name
        :return:
        """
        ret = requests.get('%s/listversions.json?project=%s' % (self.url, project_name)).text
        payload = demjson.decode(ret)
        if payload['status'] == 'ok':
            versions = payload.get('versions')
            if not versions:
                logger.error('project %s does not exist' % project_name)
            else:
                result = 'project: %s -- versions: %s' % (project_name, versions)
                logger.info(result)
        else:
            logger.error(payload['message'])

    def daemonstatus(self):
        """
        Check the server load.
        :return:
        """
        ret = requests.get('%s/daemonstatus.json' % self.url).text
        logger.info(ret)

    def show_projects_list(self):
        """
        List the projects on the server.
        :return:
        """
        ret = requests.get('%s/listprojects.json' % self.url).text
        payload = demjson.decode(ret)
        if payload:
            if not payload['projects']:
                payload['projects'] = 'no projects yet'
            result = 'projects: %s' % payload['projects']
        else:
            result = 'the project list is empty'
        logger.info(result)

    def _get_jobs_list(self, project_name):
        ret = requests.get('%s/listjobs.json?project=%s' % (self.url, project_name)).text
        return demjson.decode(ret)

    def show_jobs_list(self, project_name):
        """
        List the jobs of a project, grouped by state.
        :param project_name: project name
        :return:
        """
        ret = self._get_jobs_list(project_name)
        if ret['status'] == 'ok':
            logger.info('project: %s' % project_name)
            # Widen the pandas display so the job tables are not truncated.
            pd.set_option('display.max_columns', None)
            pd.set_option('display.max_rows', None)
            pd.set_option('display.width', 1000)
            columns = ['id', 'spider', 'start_time', 'end_time']
            logger.info('pending jobs:')
            logger.info('\n%s', pd.DataFrame(ret['pending'], columns=columns))
            logger.info('running jobs:')
            logger.info('\n%s', pd.DataFrame(ret['running'], columns=columns))
            logger.info('finished jobs:')
            logger.info('\n%s', pd.DataFrame(ret['finished'], columns=columns))
        else:
            logger.error(ret['message'])

    def del_version(self, project_name, version):
        """
        Delete a project version.
        :param project_name: project name
        :param version: version string
        :return:
        """
        data = {
            'project': project_name,
            'version': version
        }
        ret = requests.post('%s/delversion.json' % self.url, data=data).text
        payload = demjson.decode(ret)
        if payload['status'] == 'ok':
            logger.info('project: %s version: %s deleted' % (project_name, version))
        else:
            logger.error('request failed: %s' % payload['message'])

    def del_project(self, project_name):
        """
        Delete a project.
        :param project_name: project name
        :return:
        """
        data = {
            'project': project_name
        }
        ret = requests.post('%s/delproject.json' % self.url, data=data).text
        payload = demjson.decode(ret)
        if payload['status'] == 'ok':
            logger.info('project: %s deleted' % project_name)
        else:
            logger.error('request failed: %s' % payload['message'])


if __name__ == '__main__':
    project_name = ''
    spider_name = ''
    sa = ScrapydApi()
    # sa.show_projects_list()
    # sa.show_version_list(project_name)
    # sa.show_spiders_list(project_name)
    # sa.scheduler(project_name, spider_name)
    # sa.cancel(project_name, '5822a5caefc211e9ab6300e04c68038a')
    # sa.show_jobs_list(project_name)
    # sa.del_version(project_name, 'r23')
    # sa.del_project(project_name)
    # sa.addversion(project_name, 'v0.1', 'eggs.egg')
    # sa.daemonstatus()
    # sa.scheduler_all(project_name)
    # sa.cancel_all(project_name)
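# A minimal end-to-end sketch of the workflow above, following the commented
# calls in __main__. The server address and the 'quotes'/'toscrape' names are
# illustrative assumptions, not part of this module:
#
#   sa = ScrapydApi(ip='127.0.0.1', port='6800')
#   sa.addversion('quotes', 'v0.1', 'quotes.egg')  # uploads ./eggs/quotes.egg
#   sa.scheduler('quotes', 'toscrape')             # logs the job id on success
#   sa.show_jobs_list('quotes')                    # pending / running / finished tables
#   sa.cancel_all('quotes')                        # cancels every running job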