Store the scraped data in MySQL, with five fields: company name, news title, URL, news source, and time.
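The five fields map one-to-one onto a five-column table. The create_sql function in the script below issues this DDL through pymysql; spelled out as plain SQL (table baidu in database spiders, every column a VARCHAR(255), exactly as in the code) it is:

CREATE TABLE IF NOT EXISTS baidu (
    company VARCHAR(255) NOT NULL,
    title   VARCHAR(255) NOT NULL,
    link    VARCHAR(255) NOT NULL,
    source  VARCHAR(255) NOT NULL,
    time    VARCHAR(255) NOT NULL
);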
import time

import pymysql
import requests
from bs4 import BeautifulSoup
from requests import RequestException

COMPANYS = ['阿里巴巴', '京东', '亚马逊', '华为', '贵州茅台']


def get_one_page(url):
    """Fetch one result page; return the HTML text, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        # response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(c, text):
    """Yield one record per news item on the page."""
    soup = BeautifulSoup(text, 'lxml')
    titles = soup.select('.c-title > a')
    sources = soup.find_all(name='p', class_='c-author')
    for i in range(10):
        # The .c-author text reads 'source\xa0\xa0time', so splitting on the
        # non-breaking space leaves the source at [0] and the time at [2].
        author = sources[i].get_text().strip()
        data = {
            'company': COMPANYS[c],
            'title': titles[i].get_text().strip(),
            'link': titles[i]['href'],
            'source': author.split('\xa0')[0].strip(),
            'time': author.split('\xa0')[2].strip(),
        }
        yield data


def create_sql():
    """Create the baidu table with the five fields listed above."""
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         port=3306, db='spiders')
    cursor = db.cursor()
    sql = ('CREATE TABLE IF NOT EXISTS baidu ('
           'company VARCHAR(255) NOT NULL, '
           'title VARCHAR(255) NOT NULL, '
           'link VARCHAR(255) NOT NULL, '
           'source VARCHAR(255) NOT NULL, '
           'time VARCHAR(255) NOT NULL)')
    cursor.execute(sql)
    db.close()


def write_to_sql(data):
    """Insert one record, building the column list from the dict keys."""
    table = 'baidu'
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
        table=table, keys=keys, values=values)
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         port=3306, db='spiders')
    cursor = db.cursor()
    try:
        if cursor.execute(sql, tuple(data.values())):
            print('Successful')
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()


def main(c, url):
    # pn is Baidu's result offset: 0, 10, ..., 90, i.e. the first ten pages.
    for pn in range(0, 91, 10):
        link = url + '&x_bfe_rqs=03E80&tngroupname=organic_news&rsv_dl=news_b_pn&pn=' + str(pn)
        text = get_one_page(link)  # fetch the paginated link, not the bare url
        if text is None:
            continue
        for item in parse_one_page(c, text):
            print(item)
            write_to_sql(item)


if __name__ == '__main__':
    create_sql()
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&medium=0'
    urls = [url.format(com) for com in COMPANYS]
    for c, url in enumerate(urls):
        main(c, url)
        time.sleep(1)
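The source/time extraction in parse_one_page depends on Baidu rendering the .c-author line as the source and the time separated by non-breaking spaces (\xa0). A standalone sketch of that split, using a made-up sample string (the real markup may differ):

# Hypothetical .c-author text: source, two non-breaking spaces, then the time
sample = '新浪财经\xa0\xa02019年04月20日 10:30'
parts = sample.split('\xa0')  # ['新浪财经', '', '2019年04月20日 10:30']
print(parts[0].strip())       # source -> 新浪财经
print(parts[2].strip())       # time   -> 2019年04月20日 10:30

To confirm the rows actually landed in MySQL, a quick read-back query (reusing the script's connection parameters) might look like this:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456',
                     port=3306, db='spiders')
cursor = db.cursor()
cursor.execute('SELECT company, title, source, time FROM baidu LIMIT 5')
for row in cursor.fetchall():
    print(row)
db.close()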