#!/usr/bin/python
# -*- coding:utf-8 -*-
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen here; alias keeps the call sites unchanged.
    import urllib.request as urllib2
import re
from contextlib import closing

try:
    import MySQLdb
except ImportError:
    # The driver is only needed by saveDB(); fetching and parsing still work
    # without it, and saveDB() reports the missing dependency explicitly.
    MySQLdb = None


class BaiDuNews:
    """Scrape headline link texts from the Baidu News front page into MySQL.

    Typical use chains the three steps:
    ``saveDB(getContents(getPage()))``.
    """

    # Captures the text of the first link that follows each
    # ``<li class="hd...`` opening; re.S lets '.' span newlines in the markup.
    _HEADLINE_RE = re.compile(r'<li class="hd.*?<a.*?>(.*?)</a>', re.S)

    # Placeholder form: the driver quotes/escapes each value itself, so a
    # headline containing quotes can neither break nor inject into the query.
    _INSERT_SQL = 'INSERT INTO baidunews VALUES (NULL, %s)'

    def __init__(self):
        self.baseurl = 'http://news.baidu.com/'

    def getPage(self):
        """Download ``self.baseurl`` and return its body as text.

        The raw bytes are decoded as GBK.

        Raises:
            UnicodeDecodeError: if the body is not valid GBK.
            Whatever ``urlopen`` raises on network/HTTP failure.
        """
        request = urllib2.Request(self.baseurl)
        # closing() releases the connection even if read()/decode() raises.
        with closing(urllib2.urlopen(request)) as response:
            return response.read().decode('gbk')

    def getContents(self, page):
        """Extract headline texts from *page*.

        Each match is printed as it is found, then collected.

        Args:
            page: HTML text of the news page.

        Returns:
            A list of UTF-8 encoded byte strings, one per matched headline,
            in document order. Empty when nothing matches.
        """
        contents = []
        for item in self._HEADLINE_RE.findall(page):
            print(item)
            contents.append(item.encode('utf-8'))
        return contents

    def saveDB(self, contents):
        """Insert every entry of *contents* as a row of table ``baidunews``.

        Connects to the ``test`` database on 127.0.0.1 as ``root``. Each row
        is written as ``(NULL, content)``, so the table's first column must
        accept NULL (presumably an auto-generated id -- confirm against the
        actual schema). All rows are committed together; if any insert fails
        nothing is committed.

        Args:
            contents: iterable of headline strings to store. An empty
                sequence is a no-op and opens no connection.

        Raises:
            ImportError: if the MySQLdb driver is not installed.
        """
        if not contents:
            return
        if MySQLdb is None:
            raise ImportError('MySQLdb is required to save headlines')

        db = MySQLdb.connect(host='127.0.0.1', user='root', passwd='',
                             db='test', charset='utf8')
        try:
            cur = db.cursor()
            try:
                for content in contents:
                    cur.execute(self._INSERT_SQL, (content,))
            finally:
                cur.close()
            db.commit()
        finally:
            # Always release the connection, including on a failed insert.
            db.close()


if __name__ == '__main__':
    # Only scrape when run as a script, not when the module is imported.
    news = BaiDuNews()
    news.saveDB(news.getContents(news.getPage()))
# 04-15 10:47