Spider: from mobile to mobile to mobile
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys

tag_jmtool_list = ['(', '(', '-']  # name-truncation markers (two bracket variants and a dash)
ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1


def extract_name(name_):
    # cut the raw place name at the first bracket or dash marker
    for i in tag_jmtool_list:
        name_ = name_.split(i)[0]
    return name_


target_type_list = ['住宅小区', '写字楼']
target_type_list = ['住宅小区']  # narrowed to a single type for this run
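# A quick illustration of extract_name (names are hypothetical; the markers in
# tag_jmtool_list are the bracket/dash variants that appear in the raw CSV names):
#   extract_name('荟芳园(二期)')   -> '荟芳园'
#   extract_name('某某花园-东区')  -> '某某花园'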
target_dic = {}
with open('JMTool0819am/任务JMTool.csv', 'r', encoding='utf-8') as csvfile:
    for i in csvfile:
        l = i.replace(' ', '').replace('\n', '').split('";"')
        if l[0].replace('"', '') in target_type_list:
            type_, city, district, addr, name_ = l
            type_, name_ = type_.replace('"', ''), name_.replace('"', '')
            name_reduction = extract_name(name_)
            if city not in target_dic:
                target_dic[city] = {}
            if district not in target_dic[city]:
                target_dic[city][district] = {}
            if type_ not in target_dic[city][district]:
                target_dic[city][district][type_] = {}
            # guard on the type_ level; the original checked target_dic[city][district],
            # which is almost always true here and re-created the lists on every row
            if name_reduction not in target_dic[city][district][type_]:
                target_dic[city][district][type_][name_reduction] = {}
                target_dic[city][district][type_][name_reduction]['name_reduction_list'] = []
                target_dic[city][district][type_][name_reduction]['history_list'] = []
            target_dic[city][district][type_][name_reduction]['name_reduction_list'].append(name_)
            target_dic[city][district][type_][name_reduction]['history_list'].append(l)


def write_res_html(browser, dir_='baidu_map_html/'):
    # snapshot the current page, prefixed with the query and URL as an HTML comment;
    # input_ is the module-level query string set in the driver loop below
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    fo = open(file_name, 'w', encoding='utf-8')
    fo.write(page_source)
    fo.close()  # the original 'fo.closed' only read the flag and never closed the file


def gen_random_letter():
    return chr(random.randint(97, 122))


def gen_random_num():
    # single digit; the original randint(0, 10) could emit a two-character '10'
    return random.randint(0, 9)


def gen_sougo_pid():
    # fake a 16-character Sogou pid: letters at positions 1, 3, 4 and 15, digits elsewhere
    res_ = ''
    for i in range(1, 17, 1):
        if i in [1, 3, 4, 15]:
            res_ = '%s%s' % (res_, gen_random_letter())
        else:
            res_ = '%s%s' % (res_, gen_random_num())
    return res_


def close_alert(browser, attitude='accept'):
    try:
        sleep(2)
        al = browser.switch_to.alert  # a property, not a method: 'switch_to.alert()' raises TypeError
        sleep(1)
        if attitude == 'accept':
            al.accept()
        elif attitude == 'dismiss':
            al.dismiss()
        print(sys._getframe().f_lineno, 'alert-closed-ok')
    except Exception as e:
        print(sys._getframe().f_lineno, e, 'no-alert')


# input_ = '深圳市南山区荟芳园'


def mobile_mobile_pages_html(input_):
    # mobile_emulation = {
    #     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    #     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    ua_list_index = random.randint(0, ua_list_len_)
    mobile_emulation = {
        "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
    mobile_emulation['userAgent'] = ua_list[ua_list_index]
    chrome_options = Options()
    chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-%s-0007&keyword=百度地图' % (gen_sougo_pid())
    print(url_seed)
    browser.get(url_seed)
    # type the query into the search box, then click through to the result page
    js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
    browser.execute_script(js)
    xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
    browser.find_element_by_xpath(xp_newpage).click()
    sleep(2)
    # xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    # sleep(1)
    # browser.find_element_by_xpath(xp).click()
    close_alert(browser)
    try:
        xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
        sleep(2)
        close_alert(browser)
        browser.find_element_by_xpath(xp)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        browser.quit()  # quit before bailing out; the original leaked the session here
        return
    close_alert(browser)
    if browser.find_element_by_xpath(xp).text.find('全部') == -1:
        browser.quit()  # same leak fix as above
        return
    res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
    res_num = int(res_num)
    page_num = 10
    loop_breaker = math.ceil(res_num / page_num)
    close_alert(browser)
    if res_num <= page_num:
        write_res_html(browser)
        browser.quit()
        return
    close_alert(browser)
    xp = '//*[@id="place-widget-placenewlist-showall"]'
    browser.find_element_by_xpath(xp).click()
    write_res_html(browser)
    close_alert(browser)
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    try:
        xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
        browser.find_element_by_xpath(xp_newpage).click()
        sleep(1)
    except Exception as e:
        print(sys._getframe().f_lineno, e)
        write_res_html(browser)
        browser.quit()
        return
    for i in range(1, loop_breaker, 1):
        sleep(1)
        try:
            xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
            sleep(3)
            browser.find_element_by_xpath(xp).click()
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
            break
        try:
            js = "window.scrollTo(0,document.body.scrollHeight)"
            browser.execute_script(js)
            sleep(1)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
        try:
            xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
            sleep(1)
            print(input_, i)
            browser.find_element_by_xpath(xp_newpage).click()
            write_res_html(browser)
        except Exception as e:
            print(sys._getframe().f_lineno, e)
            sleep(10)
    sleep(2)
    browser.quit()


for city in target_dic:
    for district in target_dic[city]:
        for type_ in target_dic[city][district]:
            for name_reduction in target_dic[city][district][type_]:
                input_ = '%s%s%s' % (city, district, name_reduction)
                mobile_mobile_pages_html(input_)
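The script above paces everything with fixed sleep() calls, which is fragile on a slow connection. A minimal sketch of the same click driven by Selenium's explicit waits instead (the XPath is the one used above; the timeout value is illustrative):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def click_when_ready(browser, xpath, timeout=10):
    # block until the element is actually clickable instead of sleeping blindly
    el = WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    el.click()

# usage, e.g.: click_when_ready(browser, '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a')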
A linear, single-query draft of the same flow:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from time import sleep
import math

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)

input_ = '深圳市南山区荟芳园'
js = '%s%s%s' % ('document.getElementsByClassName("input-default js_input")[0].value="', input_, '"')
browser.execute_script(js)
xp_newpage = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)

xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
browser.find_element_by_xpath(xp).click()
xp = '//*[@id="place-widget-placenewlist-showall"]/span[1]'
browser.find_element_by_xpath(xp)
res_num = browser.find_element_by_xpath(xp).text.split('全部')[1].split('条')[0]
res_num = int(res_num)
page_num = 10
loop_breaker = math.ceil(res_num / page_num)


def write_res_html(browser, dir_='baidu_map_html/'):
    current_url_ = '%s%s%s%s' % ('<!--', input_, browser.current_url, '-->')
    page_source = '%s%s' % (current_url_, browser.page_source)
    localtime_ = time.strftime("%y%m%d%H%M%S", time.localtime())
    file_name = '%s%s%s%s' % (dir_, input_, localtime_, '.html')
    fo = open(file_name, 'w', encoding='utf-8')
    fo.write(page_source)
    fo.close()  # again: 'fo.closed' never closed the file


xp = '//*[@id="place-widget-placenewlist-showall"]'
browser.find_element_by_xpath(xp).click()
write_res_html(browser)

js = "window.scrollTo(0,document.body.scrollHeight)"
browser.execute_script(js)
sleep(1)
xp_newpage = '//*[@id="fis_elm__7"]/div/div[2]/span[2]'
browser.find_element_by_xpath(xp_newpage).click()
sleep(1)

for i in range(1, loop_breaker, 1):
    sleep(1)
    xp = '//*[@id="common-bottombanner-widget-fis"]/div/div/div[2]'
    browser.find_element_by_xpath(xp).click()
    js = "window.scrollTo(0,document.body.scrollHeight)"
    browser.execute_script(js)
    sleep(1)
    xp_newpage = '//*[@id="fis_elm_pager__qk_7"]/div/div/span[2]'
    browser.find_element_by_xpath(xp_newpage).click()
    write_res_html(browser)
An even smaller smoke test of the mobile emulation and the search box:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

url_seed = 'http://m.sogou.com/web/searchList.jsp?pid=sogou-mobb-123asd-0007&keyword=百度地图'
mobile_emulation = {
    "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
    "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
chrome_options = Options()
chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get(url_seed)
js = 'document.getElementsByClassName("input-default js_input")[0].value="深圳市南山区海岸城"'
browser.execute_script(js)
xp = '//*[@id="sogou_vr_21384401_1_wrap"]/div/div[1]/a'
browser.find_element_by_xpath(xp).click()
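To confirm the mobileEmulation option actually took effect, the page itself can be asked for its user agent (a standard Selenium call, added here as a suggestion):

ua_echo = browser.execute_script('return navigator.userAgent')
print(ua_echo)  # should echo the userAgent configured in mobile_emulation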
Contents of the user-agent pool file read by the scripts (one entry per line):
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Opera/9.25 (Windows NT 5.1; U; en)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)
Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)
Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0
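Note that the loaders above keep only lines containing 'Mozilla', so from a pool like this the 'Opera/9.25' entry is silently dropped (file name as used by the scripts above):

with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    kept = [i.strip() for i in uafile if i.find('Mozilla') > -1]
# 'Opera/9.25 (Windows NT 5.1; U; en)' never makes it into kept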
Next, a separate script: a URL checker that pairs requests with PhantomJS and records failures in MySQL.

import os, sys
import time
import logging
import requests
import threading
from random import choice
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

"""
Global convention, to simplify later log analysis:
os._exit(INT)
4001 4002 4003 4004
"""
os_sep = os.sep
this_file_abspath, this_file_name = os.path.dirname(os.path.abspath(__file__)), os.path.abspath(__file__).split(os_sep)[-1]
base_dir = os.path.dirname(os_sep.join(os.path.abspath(__file__).split(os_sep)[0:-2]))
log_abspath = '%s%s%s' % (base_dir, os_sep, 'log')

"""
Bootstrap logging must not depend on the logging setup itself.
"""
now_, e = time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), '启动脚本'
logf, s = '%s%s%s%s' % (log_abspath, os_sep, this_file_name, now_), '%s%s%s%s' % (__file__, now_, os.getcwd(), e)
with open(logf, 'a') as fo:
    fo.write(s)
print(s)

try:
    sys.path.append(base_dir)
    from core.utils import MysqlHelper
except Exception as e:
    s = '%s%s%s' % (
        'from core.utils import MysqlHelper EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())),
        e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4001)

try:
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s[thread:%(thread)d][process:%(process)d]',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logf,
                        filemode='a')
except Exception as e:
    s = '%s%s%s' % ('logging.basicConfig EXCEPTION ', time.strftime('%Y%m%d_%H%M%S', time.localtime(time.time())), e)
    with open(logf, 'a') as fo:
        fo.write(s)
    print(s)
    os._exit(4002)

try:
    fua, lua = '%s%s%s' % (this_file_abspath, os_sep, 'ua_list.txt'), []
    with open(fua, 'r') as fo:
        for i in fo:
            lua.append(i.replace('\n', ''))
except Exception as e:
    s = '%s%s' % ('打开文件 EXCEPTION ua文件路径: ', fua)
    logging.error(s)
    print(s)
    os._exit(4003)

# PhantomJS capabilities: pick a random user agent, blank out browserName/platform
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = choice(lua)
dcap['browserName'], dcap['platform'] = '', ''


class MyThread(threading.Thread):
    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func, self.args, self.name = func, args, name

    def run(self):
        self.func(self.args)


ctrl_start, max_script_time = time.time(), 3600 * 4


def ctrl_runtime(exit_type=''):
    # kill the script once it has run longer than max_script_time
    if time.time() - ctrl_start >= max_script_time:
        s = '%s%s%s%s%s%s%s%s%s' % (
            '程序开始执行时间', ctrl_start, '执行时间阈值', max_script_time, '终止执行', ' exit_type =', exit_type, ' threadID ',
            threading.get_ident())
        logging.info(s)
        if exit_type == '':
            exit(s)
        elif exit_type == 'sys':
            sys.exit(s)
        elif exit_type == 'os':
            # os._exit requires an int status:
            # "an integer is required" / "Required argument 'status' (pos 1) not found"
            os._exit(4004)


url_counter = 0


def main():
"""
对异常无限重启
""" try:
mysql_obj = MysqlHelper()
q = 'SELECT direct_order_id FROM test_error;'
tuple_l = mysql_obj.select(q)
pass_id_l = [i[0] for i in tuple_l]
pass_id_l = [str(i) for i in pass_id_l]
pass_id_l_s = ','.join(pass_id_l)
del mysql_obj, tuple_l # 业务当前未失效的url在在test_order具有唯一行
#
"""
后期任务:
test_error积累一定数据后对url重新检测
#3个功能点:当前半个小时、当前未失效的url test_order内url的异常情况(当前的2个功能点)、(后期任务:test_error积累一定数据后对url重新检测) q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) - create_time<=3600*48 AND id NOT in ( %s ) ORDER BY id DESC ;' % (
pass_id_l_s) q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
pass_id_l_s) """ mysql_obj = MysqlHelper()
        q = 'SELECT url,id FROM test_order WHERE unix_timestamp(now()) < expire_time AND id NOT in ( %s ) ORDER BY id DESC ;' % (
            pass_id_l_s)
        tuple_l = mysql_obj.select(q)
        del mysql_obj
        if len(tuple_l) == 0:
            s = '无待检测url,程序退出'
            print(s)
            logging.info(s)
    except Exception as e:
        s = '%s%s%s' % ('初始数据,查询数据库异常,无限次重启该脚本', e, time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())))
        print(s)
        logging.warning(s)
        cmd = 'python %s' % (__file__)
        os.system(cmd)
        os._exit(1024)

    # The script is expected to run hourly. Abnormal urls are handled as follows:
    # if the first request looks as expected, stop; otherwise, after a 30s gap,
    # retry a few more times at 10s intervals.
    sleep_counter, sleep_step, sleep_seconds, mycode_l, repeat_times, repeat_sleep_times = 0, 20, 1, [
        'g3user.com', '51g3.com.cn'], 4, 10

    # TODO: refactor into a base class (where list); current change serves the f_l field requirement
    def get_onerow(url, f_l=['title', 'uid', 'money_total'], tab='test_order'):
        t = -1
        try:
            mysql_obj = MysqlHelper()
            f_s = ','.join(f_l)
            q = 'SELECT %s FROM %s WHERE url="%s" ORDER BY id DESC LIMIT 1' % (f_s, tab, url)
            s = '%s%s' % (' DB ', q)
            logging.info(s)
            t = mysql_obj.select(q)
            if t != -1:
                t = t[0]
            del mysql_obj
        except Exception as e:
            s = '%s%s' % (' DB ', e)
            logging.info(s)
            return t
        return t

    def chk_exception_url(url, sleep_seconds=0, http_tag='http://'):
        time.sleep(sleep_seconds)
        global url_counter
        ret = {}
        # url status in db: 0 = does not open, 1 = opens but no ad, 2 = handled
        ret['ok'], ret['status_code'], s = -1, -1, '%s%s%s%s' % (
            time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
        ret['info'] = ''  # pre-set: the 403 branch otherwise leaves 'info' missing for the INSERT below
        try:
            if url.find('http') == -1:
                url = '%s%s' % (http_tag, url)
            r = requests.get(url)
            ret['status_code'], txt_pos = int(r.status_code), -1
            s = '%s,%s,%s,%s,%s' % (s, ret['status_code'], url, r, r.reason)
        except Exception as e:
            ret['ok'] = 0
            s = '%s %s %s' % (s, ' SPIDER ', e)
            logging.error(s)
            print(e, url)
        # For now, only a 200 from the target site is considered.
        if ret['status_code'] == 200:
            for ii in mycode_l:
                if r.text.find(ii) > -1:
                    ret['ok'], txt_pos = 1, 1
                    break
            if txt_pos == -1:
                # the plain HTML did not contain our code: render with PhantomJS and retry
                try:
                    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                                 executable_path='/usr/local/phantomjs/bin/phantomjs')
                    driver.get(url)
                    time.sleep(1)
                    page_source = driver.page_source
                    driver.quit()
                    for ii in mycode_l:
                        if page_source.find(ii) > -1:
                            ret['ok'] = 1
                            break
                    if ret['ok'] == -1:
                        s = '%s%s' % (s, '返回200,但是在html中未检测到我公司代码。')
                        ret['ok'], ret['info'] = 0, s
                except Exception as e:
                    s = '%s %s %s' % (s, ' SPIDER ', e)
                    logging.error(s)
                    print(e, url)
        # elif ret['status_code'] == 403:
        #     www.hsdcw.com/fenlei/41668214.html
        elif ret['status_code'] == 403:
            pass
        else:
            ret['ok'], ret['info'] = 0, s
        url_counter += 1
        s = '%s/%s%s%s' % (url_counter, len(tuple_l), 'chk-ret', s)
        print(s)
        if ret['ok'] == 0:
            logging.warning(s)
        else:
            logging.info(s)
        return ret

    tn, tl, tstep = len(tuple_l), [], 4000

    def tf(ts):
        te = ts + tstep
        te = min(te, tn)
        for i in tuple_l[ts:te]:
            ctrl_runtime(exit_type='os')
            url, chk_id = i
            s = '%s%s%s%s' % (
                time.strftime('%Y%m%d %H:%M:%S', time.localtime(time.time())), ' threadID ', threading.get_ident(), url)
            if chk_id in pass_id_l:
                # note: pass_id_l holds strings while chk_id comes back as an int, so this
                # branch rarely fires; the NOT IN query already excludes these ids anyway
                s = '%s%s' % (s, ' 跳过,之前test_error已写入该url ')
                logging.info(s)
                print(s)
            """
            Rule for iask.sina.com: do not check.
            """
            if url.find('iask.sina.com') > -1:
                continue
            write_db_flag = 1
            for t in range(0, repeat_times, 1):
                ret = chk_exception_url(url, repeat_sleep_times)
                if ret['ok'] == 1:
                    write_db_flag = 0
                    break
            if write_db_flag == 1:
                try:
                    title, uid, money_total = get_onerow(url)
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception-去test_order查', e)
                    logging.info(s)
                    print(s)
                    break
                # Threading note: given the db wrapper's connection limits, instantiate
                # the helper per use and delete it afterwards.
                try:
                    # could be folded into the class constructor
                    mysql_obj = MysqlHelper()
                except Exception as e:
                    s = '%s%s%s' % (s, ' DB Exception- ', e)
                    logging.error(s)
                    print(s)
                    break
                """
                Multi-process / multi-thread concurrency.
                To be optimized later, e.g. with a queue.
                """
                q = 'SELECT id FROM test_error WHERE url="%s" LIMIT 1' % (url)
                try:
                    r = mysql_obj.select(q)
                    s = '%s%s%s' % (s, ' -SQL- ', q)
                    logging.info(s)
                    print(q)
                except Exception as e:
                    s = '%s %s %s %s' % (s, ' DB Exception-', q, e)
                    logging.info(s)
                    print(s)
                    break
                ctime = int(time.time())
                # the table design here could be improved
                db_status = 1 if ret['status_code'] == 200 else 0
                if len(r) == 0:
                    q = 'INSERT INTO test_error (title,url,status,remarks,update_time,create_time,uid,money,direct_order_id) VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        title, url, db_status, ret['info'], ctime, ctime, uid, money_total, chk_id)
                    try:
                        mysql_obj.execute(q)
                        mysql_obj.commit()
                        del mysql_obj
                        s = '%s%s%s' % (s, ' DB SQL ok ', q)
                        logging.info(s)
                        print(s)
                    except Exception as e:
                        s = '%s%s%s%s' % (s, ' DB Exception- ', q, e)
                        logging.error(s)
                        print(s)
                elif len(r) == 1:
                    continue

    for i in range(0, tn, tstep):
        if i >= tn:
            break
        thread_instance = MyThread(tf, (i), tf.__name__)  # (i) is just i, not a tuple; run() passes it straight to tf
        tl.append(thread_instance)
    for t in tl:
        t.setDaemon(False)  # the original 't.setDaemon = False' overwrote the method instead of calling it
        t.start()
    for t in tl:
        t.join()


if __name__ == '__main__':
    main()
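A caveat in chk_exception_url: requests.get(url) is issued without a timeout, so a single hung server can stall a worker thread indefinitely. A minimal hardening sketch (the (connect, read) timeout pair is standard requests behavior; the values here are illustrative):

import requests

def get_with_timeout(url, connect_timeout=5, read_timeout=15):
    # raises requests.exceptions.Timeout when either limit is exceeded
    return requests.get(url, timeout=(connect_timeout, read_timeout))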
Finally, a quick grab of Baidu mobile's related-search suggestions:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.firefox.options import Options
import time
from time import sleep
import math
import random
import sys
import threading
from random import choice
# import urllib.parse
from bs4 import BeautifulSoup

ua_list = []
with open('mobile_ua.txt', 'r', encoding='utf-8') as uafile:
    for i in uafile:
        if i.find('Mozilla') > -1:
            ua_list.append(i.replace('\n', '').strip())
ua_list_len_ = len(ua_list) - 1


def close_alert(browser, attitude='accept'):
    # stubbed out: alerts are simply ignored in this variant
    # js = 'alert(window.alert=function(str){return;}'
    # browser.execute_script(js)
    # js = 'window.alert = function(str){return ;}'
    # browser.execute_script(js)
    return


# mobile_emulation = {
#     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
#     "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
ua_list_index = random.randint(0, ua_list_len_)  # unused while the emulation block below stays commented out
# mobile_emulation = {
#     "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0}}
#
# mobile_emulation['userAgent'] = choice(ua_list)
# chrome_options = Options()
# chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
# browser = webdriver.Chrome(chrome_options=chrome_options)
browser = webdriver.Chrome()
s_wd = '长尾'
url_seed = 'https://m.baidu.com/s?word=s_wd'
url_seed = url_seed.replace('s_wd', s_wd)
print(url_seed)
browser.get(url_seed)

# related-search links render as <a class="rw-item">; collect their text and href
rd = BeautifulSoup(browser.page_source, 'html.parser').find_all('a', class_='rw-item')
res_d_l = [{'contents': d.contents, 'href': d.attrs['href']} for d in rd]
browser.quit()
d = 3  # leftover debugger anchor
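A small follow-on to inspect what was collected (keys as built above; the exact contents depend on Baidu's current markup):

for item in res_d_l:
    print(item['href'], item['contents'])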