蜘蛛页面
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings (host, port, user, password, database) shared by
# the two helpers below.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query on a fresh connection and return all rows of its result.

    Args:
        sql: SQL statement to execute.
        res_type: 'dic' returns rows as dicts (``DictCursor``); any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values can
            be bound by the driver instead of being formatted into ``sql``.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened (the error is printed).

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a data-modifying statement on a fresh connection and commit.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        1 when the connection could not be opened (the error is printed),
        0 after the statement has been committed.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
    return 0


# Chrome session driven by the crawl script that follows.
browser = webdriver.Chrome()
def _sql_quote(value):
    """Escape value for embedding inside a double-quoted SQL string literal."""
    return pymysql.converters.escape_string(str(value))


def _save_snapshot(html):
    """Write html to a randomly numbered temp file and return the file path."""
    path = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return path


def _site_links(html, current_url):
    """Return the distinct absolute gushiwen.org links in html, in page order.

    Anchors without an href, javascript links, relative links and the page's
    own URL are left out.
    """
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http'):
            continue
        if 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == current_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def _insert_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link with a single INSERT."""
    if not links:
        return
    rows = [
        '("{}","{}","{}")'.format(_sql_quote(title), _sql_quote(parent_url), _sql_quote(link))
        for link in links
    ]
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES ' + ','.join(rows)
    print(sql)
    mysql_write(sql)


# Rows are written first and duplicates removed afterwards. The newer copy
# (MAX(id)) is the one deleted so the older row keeps its if_spider flag;
# deleting MIN(id) would drop the already-crawled row and cause re-crawling.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'

# Open a search engine first, then hop to the target site from there.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed the link table from the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
html = browser.page_source
# The snapshot is only a debugging aid; links are parsed from memory so that
# parallel runs picking the same random file name cannot read each other's page.
myhtml = _save_snapshot(html)
_insert_links(browser.title, url, _site_links(html, url))
print(sql_del)
mysql_write(sql_del)

sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0 ORDER BY id DESC '
while True:
    res = mysql_fetch(sql_ori, 'dic')
    for d in res:
        page_url, url = d['page_url'], d['children_url']
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        html = browser.page_source
        myhtml = _save_snapshot(html)
        _insert_links(browser.title, url, _site_links(html, url))
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as crawled.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
            _sql_quote(page_url), _sql_quote(url))
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)
    if not res:
        # Nothing queued: back off instead of polling the database in a tight loop.
        time.sleep(3)

# Not reached while the loop above runs forever; kept from the original script.
dd = 0
-- Link table used by the spider: one row per (parent page, child link) pair.
--   page_title   : title of the page on which the link was found
--   page_url     : URL of that page
--   children_url : URL of a link found on page_url
--   if_spider    : 0 = children_url not crawled yet; the spider sets it to 1 after visiting it
CREATE TABLE `parent_url` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`page_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci DEFAULT NULL,
`page_url` text,
`children_url` text,
`if_spider` tinyint(4) DEFAULT '0',
PRIMARY KEY (`id`)
) ENGINE=MyISAM AUTO_INCREMENT=5328 DEFAULT CHARSET=latin1;
先写入,后删除重复记录:
避免在每次写入前逐条检查记录是否已存在,
从而节省检查所消耗的时间
获取一个网站的全部url
修复逻辑错误
支持 多进程 脚本多开
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings (host, port, user, password, database) shared by
# the two helpers below.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query on a fresh connection and return all rows of its result.

    Args:
        sql: SQL statement to execute.
        res_type: 'dic' returns rows as dicts (``DictCursor``); any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values can
            be bound by the driver instead of being formatted into ``sql``.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened (the error is printed).

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a data-modifying statement on a fresh connection and commit.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        1 when the connection could not be opened (the error is printed),
        0 after the statement has been committed.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
    return 0


# Chrome session driven by the crawl script that follows.
browser = webdriver.Chrome()
def _sql_quote(value):
    """Escape value for embedding inside a double-quoted SQL string literal."""
    return pymysql.converters.escape_string(str(value))


def _save_snapshot(html):
    """Write html to a randomly numbered temp file and return the file path."""
    path = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return path


def _site_links(html, current_url):
    """Return the distinct absolute gushiwen.org links in html, in page order.

    Anchors without an href, javascript links, relative links and the page's
    own URL are left out.
    """
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http'):
            continue
        if 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == current_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def _insert_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link with a single INSERT."""
    if not links:
        return
    rows = [
        '("{}","{}","{}")'.format(_sql_quote(title), _sql_quote(parent_url), _sql_quote(link))
        for link in links
    ]
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES ' + ','.join(rows)
    print(sql)
    mysql_write(sql)


# Rows are written first and duplicates removed afterwards: for every
# (page_url, children_url) group with more than one row, the newest row is deleted.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'

# Open a search engine first, then hop to the target site from there.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed the link table from the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
html = browser.page_source
# The snapshot is only a debugging aid; links are parsed from memory so that
# parallel runs picking the same random file name cannot read each other's page.
myhtml = _save_snapshot(html)
_insert_links(browser.title, url, _site_links(html, url))
print(sql_del)
mysql_write(sql_del)

# URLs containing any of these fragments are never visited, e.g.
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/']
# Drops password-related rows (title contains '密码' or a URL contains 'PWD').
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE INSTR(page_title,'密码')>0 OR INSTR(UPPER(page_url),'PWD')>0 OR INSTR(UPPER(children_url),'PWD')>0) AS t);"
sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'

while True:
    mysql_write(sql_filter)
    res = mysql_fetch(sql_ori, 'dic')
    # Start each pass at a random offset (skipping the first jump_s - 1 rows)
    # so that several copies of the script do not all begin on the same row.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        if any(kw in url for kw in url_kw_filter_l):
            continue
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        html = browser.page_source
        myhtml = _save_snapshot(html)
        _insert_links(browser.title, url, _site_links(html, url))
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as crawled.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
            _sql_quote(page_url), _sql_quote(url))
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)
    if not res:
        # Nothing queued: back off instead of polling the database in a tight loop.
        time.sleep(3)

# Not reached while the loop above runs forever; kept from the original script.
dd = 0
代码的每一个功能点的模块化
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings (host, port, user, password, database) shared by
# the two helpers below.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query on a fresh connection and return all rows of its result.

    Args:
        sql: SQL statement to execute.
        res_type: 'dic' returns rows as dicts (``DictCursor``); any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values can
            be bound by the driver instead of being formatted into ``sql``.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened (the error is printed).

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a data-modifying statement on a fresh connection and commit.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        1 when the connection could not be opened (the error is printed),
        0 after the statement has been committed.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
    return 0


# Chrome session driven by the crawl script that follows.
browser = webdriver.Chrome()
def _sql_quote(value):
    """Escape value for embedding inside a double-quoted SQL string literal."""
    return pymysql.converters.escape_string(str(value))


def _save_snapshot(html):
    """Write html to a randomly numbered temp file and return the file path."""
    path = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return path


def _site_links(html, current_url):
    """Return the distinct absolute gushiwen.org links in html, in page order.

    Anchors without an href, javascript links, relative links and the page's
    own URL are left out.
    """
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http'):
            continue
        if 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == current_url or href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links


def _insert_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link with a single INSERT."""
    if not links:
        return
    rows = [
        '("{}","{}","{}")'.format(_sql_quote(title), _sql_quote(parent_url), _sql_quote(link))
        for link in links
    ]
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES ' + ','.join(rows)
    print(sql)
    mysql_write(sql)


# Rows are written first and duplicates removed afterwards: for every
# (page_url, children_url) group with more than one row, the newest row is deleted.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'

# Open a search engine first, then hop to the target site from there.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed the link table from the start page.
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
html = browser.page_source
# The snapshot is only a debugging aid; links are parsed from memory so that
# parallel runs picking the same random file name cannot read each other's page.
myhtml = _save_snapshot(html)
_insert_links(browser.title, url, _site_links(html, url))
print(sql_del)
mysql_write(sql_del)

# Rows whose parent or child URL contains one of these fragments are deleted
# from the queue before every pass, e.g.
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))
# Printed after the placeholder is expanded so the log shows the real statement.
print(sql_filter)

sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
while True:
    mysql_write(sql_filter)
    res = mysql_fetch(sql_ori, 'dic')
    # Start each pass at a random offset (skipping the first jump_s - 1 rows)
    # so that several copies of the script do not all begin on the same row.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        time.sleep(1)
        browser.refresh()
        html = browser.page_source
        myhtml = _save_snapshot(html)
        _insert_links(browser.title, url, _site_links(html, url))
        print(sql_del)
        mysql_write(sql_del)
        # Mark this (parent, child) pair as crawled.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
            _sql_quote(page_url), _sql_quote(url))
        mysql_write(sql_udp)
        print(sql_udp)
        time.sleep(3)
    if not res:
        # Nothing queued: back off instead of polling the database in a tight loop.
        time.sleep(3)

# Not reached while the loop above runs forever; kept from the original script.
dd = 0
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings (host, port, user, password, database) shared by
# the two helpers below.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query on a fresh connection and return all rows of its result.

    Args:
        sql: SQL statement to execute.
        res_type: 'dic' returns rows as dicts (``DictCursor``); any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values can
            be bound by the driver instead of being formatted into ``sql``.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened (the error is printed).

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a data-modifying statement on a fresh connection and commit.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        1 when the connection could not be opened (the error is printed),
        0 after the statement has been committed.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
    return 0


# Chrome session driven by the crawl script that follows.
browser = webdriver.Chrome()
def _sql_quote(value):
    """Escape value for embedding inside a double-quoted SQL string literal."""
    return pymysql.converters.escape_string(str(value))


def _save_snapshot(html):
    """Write html to a randomly numbered temp file and return the file path."""
    path = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return path


def _site_links(html, current_url, blocked_keywords=()):
    """Return the distinct absolute gushiwen.org links in html, in page order.

    Anchors without an href, javascript links, relative links, the page's own
    URL and links containing any of blocked_keywords (compared
    case-insensitively) are left out.
    """
    blocked = [kw.upper() for kw in blocked_keywords]
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http'):
            continue
        if 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == current_url or href in seen:
            continue
        # Both sides are upper-cased; comparing the raw keyword against the
        # upper-cased URL would never match.
        href_upper = href.upper()
        if any(kw in href_upper for kw in blocked):
            continue
        seen.add(href)
        links.append(href)
    return links


def _insert_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link with a single INSERT."""
    if not links:
        return
    rows = [
        '("{}","{}","{}")'.format(_sql_quote(title), _sql_quote(parent_url), _sql_quote(link))
        for link in links
    ]
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES ' + ','.join(rows)
    print(sql)
    mysql_write(sql)


# Rows are written first and duplicates removed afterwards: for every
# (page_url, children_url) group with more than one row, the newest row is deleted.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'

# Open a search engine first, then hop to the target site from there.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

# Seed the link table from the start page (no keyword filter is applied here).
url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
html = browser.page_source
# The snapshot is only a debugging aid; links are parsed from memory so that
# parallel runs picking the same random file name cannot read each other's page.
myhtml = _save_snapshot(html)
_insert_links(browser.title, url, _site_links(html, url))
print(sql_del)
mysql_write(sql_del)

# Links containing one of these fragments are neither stored nor kept in the
# queue, e.g.
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))
sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    print(sql_del)
    mysql_write(sql_del)

    # Optimistic shortcut: a URL that already appears as a parent page is
    # assumed to have been crawled, so it is excluded from the queue.
    url_pass = ['"{}"'.format(_sql_quote(row[0])) for row in mysql_fetch(sql_pass, res_type='tuple')]
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    if url_pass:
        # "NOT IN ()" is not valid SQL, so the clause is added only when
        # there is something to exclude.
        sql_ori += ' AND children_url NOT IN ({})'.format(','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')

    # Start each pass at a random offset (skipping the first jump_s - 1 rows)
    # so that several copies of the script do not all begin on the same row.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        try:
            # Wait a moment, then scroll to the bottom of the page before
            # taking the page source.
            time.sleep(1)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            browser.execute_script(js)
        except Exception as e:
            # Best effort: a failed scroll must not stop the crawl.
            print('window.scrollTo-->', e)
        html = browser.page_source
        myhtml = _save_snapshot(html)
        _insert_links(browser.title, url, _site_links(html, url, url_kw_filter_l))
        # Mark this (parent, child) pair as crawled.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
            _sql_quote(page_url), _sql_quote(url))
        mysql_write(sql_udp)
        print(sql_udp)
    if not res:
        # Nothing queued: back off instead of polling the database in a tight loop.
        time.sleep(3)

# Not reached while the loop above runs forever; kept from the original script.
dd = 0
from selenium import webdriver
import time
import random
from bs4 import *
import pymysql

# MySQL connection settings (host, port, user, password, database) shared by
# the two helpers below.
h, pt, u, p, db = 'localhost', 3306, 'root', 'root', 'test'


def mysql_fetch(sql, res_type='tuple', args=None):
    """Run a query on a fresh connection and return all rows of its result.

    Args:
        sql: SQL statement to execute.
        res_type: 'dic' returns rows as dicts (``DictCursor``); any other
            value uses the connection's default cursor.
        args: Optional parameters handed to ``cursor.execute`` so values can
            be bound by the driver instead of being formatted into ``sql``.

    Returns:
        The result of ``cursor.fetchall()``, or ``()`` when the connection
        could not be opened (the error is printed).

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return ()
    try:
        if res_type == 'dic':
            cursor = conn.cursor(pymysql.cursors.DictCursor)
        else:
            cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
            # Read the rows while the cursor and connection are still open.
            rows = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return rows
    finally:
        conn.close()


def mysql_write(sql, args=None):
    """Execute a data-modifying statement on a fresh connection and commit.

    Args:
        sql: SQL statement to execute.
        args: Optional parameters handed to ``cursor.execute``.

    Returns:
        1 when the connection could not be opened (the error is printed),
        0 after the statement has been committed.

    Raises:
        Whatever ``cursor.execute`` raises; the cursor and connection are
        closed before the exception propagates.
    """
    try:
        conn = pymysql.connect(host=h, port=pt, user=u, passwd=p, db=db, charset='utf8')
    except Exception as e:
        print(e)
        return 1
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, args)
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
    return 0


# Chrome session driven by the crawl script that follows.
browser = webdriver.Chrome()
def _sql_quote(value):
    """Escape value for embedding inside a double-quoted SQL string literal."""
    return pymysql.converters.escape_string(str(value))


def _save_snapshot(html):
    """Write html to a randomly numbered temp file and return the file path."""
    path = 'D:\\myhtml\\{}gushiwen.tmp.html'.format(random.randint(123, 999))
    with open(path, 'w', encoding='utf-8') as fw:
        fw.write(html)
    return path


def _site_links(html, current_url, blocked_keywords=()):
    """Return the distinct absolute gushiwen.org links in html, in page order.

    Anchors without an href, javascript links, relative links, the page's own
    URL and links containing any of blocked_keywords (compared
    case-insensitively) are left out.
    """
    blocked = [kw.upper() for kw in blocked_keywords]
    soup = BeautifulSoup(html, 'html.parser')
    seen = set()
    links = []
    # href=True skips anchors that have no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.startswith('http'):
            continue
        if 'gushiwen.org' not in href or 'javascript' in href:
            continue
        if href == current_url or href in seen:
            continue
        # Both sides are upper-cased; comparing the raw keyword against the
        # upper-cased URL would never match.
        href_upper = href.upper()
        if any(kw in href_upper for kw in blocked):
            continue
        seen.add(href)
        links.append(href)
    return links


def _insert_links(title, parent_url, links):
    """Insert one (title, parent, child) row per link with a single INSERT."""
    if not links:
        return
    rows = [
        '("{}","{}","{}")'.format(_sql_quote(title), _sql_quote(parent_url), _sql_quote(link))
        for link in links
    ]
    sql = 'INSERT INTO parent_url (page_title,page_url,children_url) VALUES ' + ','.join(rows)
    print(sql)
    mysql_write(sql)


# Rows are written first and duplicates removed afterwards: for every
# (page_url, children_url) group with more than one row, the newest row is deleted.
sql_del = 'DELETE FROM parent_url WHERE id IN ( SELECT max_id FROM ( SELECT MAX(id) AS max_id, COUNT(1) AS c FROM parent_url GROUP BY CONCAT(page_url,children_url) ) AS tab WHERE c>1 )'

# Open a search engine first, then hop to the target site from there.
f_url_l = ['https://www.baidu.com/', 'https://www.so.com/']
f_url_l_a = f_url_l[int(time.time()) % len(f_url_l)]
browser.get(f_url_l_a)
time.sleep(random.randint(1, 2))

url = 'https://so.gushiwen.org/shiwenv_5a36a4613434.aspx'
js = 'window.location.href="{}";'.format(url)
browser.execute_script(js)
# NOTE(review): in this revision the seeding loop started with `break`, so no
# link from the start page was ever inserted. That behaviour is kept (only the
# snapshot is written) and the unreachable loop body is removed. Presumably
# the table is expected to be seeded already - confirm before re-enabling.
myhtml = _save_snapshot(browser.page_source)
print(sql_del)
mysql_write(sql_del)

# Links containing one of these fragments are neither stored nor kept in the
# queue, e.g.
# https://www.gushiwen.org/FileNotFound.htm?aspxerrorpath=/user/findpwd.aspx
# https://so.gushiwen.org/user/findpwd.aspx?from=http://so.gushiwen.org/user/collect.aspx
# https://so.gushiwen.org/app/
# https://so.gushiwen.org/jiucuo.aspx?u=
url_kw_filter_l = ['FileNotFound', 'findpwd', '/app/', '/jiucuo.aspx']
sql_filter = "DELETE FROM parent_url WHERE id IN ( SELECT id FROM ( SELECT id FROM parent_url WHERE MYWHERE ) AS t);"
# One case-insensitive INSTR test per keyword and URL column, OR-ed together.
sql_s_l = [
    " INSTR(UPPER(page_url),'{}')>0 OR INSTR(UPPER(children_url),'{}')>0 ".format(kw.upper(), kw.upper())
    for kw in url_kw_filter_l
]
sql_filter = sql_filter.replace('MYWHERE', ' OR '.join(sql_s_l))
sql_pass = 'SELECT DISTINCT(page_url) FROM parent_url'

while True:
    mysql_write(sql_filter)
    print(sql_filter)
    print(sql_del)
    mysql_write(sql_del)

    # Optimistic shortcut: a URL that already appears as a parent page is
    # assumed to have been crawled, so it is excluded from the queue.
    url_pass = ['"{}"'.format(_sql_quote(row[0])) for row in mysql_fetch(sql_pass, res_type='tuple')]
    sql_ori = 'SELECT page_url,children_url FROM parent_url WHERE if_spider=0'
    if url_pass:
        # "NOT IN ()" is not valid SQL, so the clause is added only when
        # there is something to exclude.
        sql_ori += ' AND children_url NOT IN ({})'.format(','.join(url_pass))
    res = mysql_fetch(sql_ori, 'dic')

    # Start each pass at a random offset (skipping the first jump_s - 1 rows)
    # so that several copies of the script do not all begin on the same row.
    jump_s = random.randint(0, max(0, len(res) - 10))
    for d in res[max(jump_s - 1, 0):]:
        page_url, url = d['page_url'], d['children_url']
        js = 'window.location.href="{}";'.format(url)
        browser.execute_script(js)
        try:
            # Wait a moment, then scroll to the bottom of the page before
            # taking the page source.
            time.sleep(1)
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            browser.execute_script(js)
        except Exception as e:
            # Best effort: a failed scroll must not stop the crawl.
            print('window.scrollTo-->', e)
        html = browser.page_source
        # The snapshot is only a debugging aid; links are parsed from memory
        # so that parallel runs picking the same random file name cannot read
        # each other's page.
        myhtml = _save_snapshot(html)
        _insert_links(browser.title, url, _site_links(html, url, url_kw_filter_l))
        # Mark this (parent, child) pair as crawled.
        sql_udp = 'UPDATE parent_url SET if_spider=1 WHERE page_url="{}" AND children_url="{}"'.format(
            _sql_quote(page_url), _sql_quote(url))
        mysql_write(sql_udp)
        print(sql_udp)
    if not res:
        # Nothing queued: back off instead of polling the database in a tight loop.
        time.sleep(3)

# Not reached while the loop above runs forever; kept from the original script.
dd = 0
(父,子)url有序二元组