爬科学基金共享服务网中基金数据

#coding=utf-8
import json
import requests
from lxml import etree
from HTMLParser import HTMLParser
# Form fields posted with every search request.  All search filters are
# empty strings; 'currentPage' is overridden per request by main().
# 'checkCode' is the percent-encoded UTF-8 text "请输入验证码".
data = {
    'pageSize': 10,
    'currentPage': 1,
    'fundingProject.projectNo': '',
    'fundingProject.name': '',
    'fundingProject.person': '',
    'fundingProject.org': '',
    'fundingProject.applyCode': '',
    'fundingProject.grantCode': '',
    'fundingProject.subGrantCode': '',
    'fundingProject.helpGrantCode': '',
    'fundingProject.keyword': '',
    'fundingProject.statYear': '',
    'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
}

# Search endpoint that receives the POSTed form.
url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# Request headers sent with every POST.
# NOTE(review): these look like they were copied from a browser session; the
# JSESSIONID cookie is hard-coded and presumably expires -- confirm before use.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
    'Host': 'npd.nsfc.gov.cn',
    'Origin': 'http://npd.nsfc.gov.cn',
    'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# (result key, label text) pairs, in the order jiexi() scans for them.
_FIELDS = (
    ('standard_no', u'批准号'),
    ('standard_type', u'项目类别'),
    ('supporting_institution', u'依托单位'),
    ('project_principal', u'项目负责人'),
    ('funds', u'资助经费'),
    ('year', u'批准年度'),
    ('keywords', u'关键词'),
)


def main(first_page=1, last_page=43183):
    """Fetch result pages and store every parsed record in MongoDB.

    For each page number from ``first_page`` to ``last_page`` (inclusive) the
    search form is POSTed to ``url``, every ``<dl class="time_dl">`` element
    of the response is serialised, unescaped, parsed with ``jiexi`` and
    inserted into the ``science_fund`` collection of the ``ScienceFund``
    database on localhost:27017.

    Args:
        first_page: First page number to request (default 1).
        last_page: Last page number to request (default 43183, the range the
            script was originally hard-coded to).

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            the 30 second timeout.
    """
    # Imported here rather than at module level so that the pure parser
    # jiexi() can be imported without a MongoDB driver installed.
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    db = client.ScienceFund
    # NOTE(review): Database.authenticate and Collection.insert are legacy
    # PyMongo calls -- confirm they exist in the installed driver version.
    db.authenticate("", "")
    collection = db.science_fund

    # One parser instance is enough; it was previously rebuilt per record.
    unescape = HTMLParser().unescape

    for i in range(first_page, last_page + 1):
        print(i)
        # Per-request copy, so the module-level template is never mutated.
        payload = dict(data)
        payload['currentPage'] = i
        # The timeout keeps one stalled connection from hanging the whole run.
        result = requests.post(url, data=payload, headers=headers, timeout=30)
        tree = etree.HTML(result.text)
        for item in tree.xpath("//dl[@class='time_dl']"):
            content = unescape(etree.tostring(item, method='html'))
            collection.insert(jiexi(content))


def _extract_field(content, label, start):
    """Return ``(value, next_start)`` for the entry introduced by ``label``.

    The value is the text between the label (plus the one separator character
    that follows it, presumably a colon) and the next ``</dd>``, stripped of
    surrounding whitespace.  If the label is not found at or after ``start``
    the value is an empty string and ``next_start`` is ``start`` unchanged, so
    later fields can still be located.
    """
    label_pos = content.find(label, start)
    if label_pos == -1:
        return u'', start
    end = content.find('</dd>', label_pos)
    if end == -1:
        # No closing tag: take everything up to the end of the fragment.
        end = len(content)
    value = content[label_pos + len(label) + 1:end].strip()
    return value, end


def jiexi(content):
    """Parse one serialised ``<dl class="time_dl">`` fragment into a dict.

    Args:
        content: Markup of a single result entry, as text.

    Returns:
        A dict with the keys ``title``, ``standard_no``, ``standard_type``,
        ``supporting_institution``, ``project_principal``, ``funds``, ``year``
        and ``keywords``.  A piece that cannot be located is returned as an
        empty string instead of a mis-sliced chunk of markup.
    """
    # The title sits between the first '">' found from offset 20 onwards
    # (offset 20 skips the '">' that closes <dl class="time_dl">) and the
    # first closing tag in the fragment.
    title_start = content.find('">', 20)
    title_end = content.find('</')
    if title_start == -1 or title_end == -1:
        title = u''
    else:
        title = content[title_start + 2:title_end]

    record = {'title': title}
    # Each field is searched for after the end of the previous one.
    cursor = max(title_end, 0)
    for key, label in _FIELDS:
        record[key], cursor = _extract_field(content, label, cursor)
    return record


if __name__ == '__main__':
    main()
05-15 01:55