First, split the url on '/':

url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
ends = "robots.txt" url = url.split('/')
print(url)

Output:
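Running the split, the resulting list should look like this (the query string stays attached to the last element):

['https:', '', 'www.baidu.com', 's?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875']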

Next clean the list up: take the first three elements, join them back with '/' to recover the scheme and host, then use urljoin to attach ends:

from urllib.parse import urljoin

url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
ends = "robots.txt" url = url.split('/')
url = '/'.join(url[:3])
url = urljoin(url,ends)
print(url)

Output:
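With the first three elements joined and robots.txt appended, the printed URL should be the site root plus the robots.txt path:

https://www.baidu.com/robots.txt

As a side note, urllib.parse also provides urlsplit, which extracts the scheme and host without slicing a list by hand. A minimal sketch of the same idea (the names base and robots_url are just for illustration):

from urllib.parse import urljoin, urlsplit

url = 'https://www.baidu.com/s?wd=123'
parts = urlsplit(url)                        # scheme='https', netloc='www.baidu.com'
base = parts.scheme + '://' + parts.netloc   # 'https://www.baidu.com'
robots_url = urljoin(base, 'robots.txt')
print(robots_url)                            # https://www.baidu.com/robots.txt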

import requests
html = requests.get(url)
print(html.text)

The printed result is:

G:\python3.8\python.exe "F:/python post/code/RobotsTest.py"
User-agent: Baiduspider
Disallow: /baidu
Disallow: /s?
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Googlebot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: MSNBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Baiduspider-image
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: YoudaoBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou web spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou inst spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou spider2
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou blog
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou News Spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sogou Orion spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: ChinasoSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Sosospider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: yisouspider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: EasouSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: *
Disallow: /

Process finished with exit code 0

Each User-agent block in this output lists its own Disallow rules and ends at the next blank line. Now write the content into a local robots.txt file, read it back, and check whether the disallow list under our User-agent covers the URL we want to crawl.

Let's assume the request header's User-agent is Googlebot and check whether it appears:

headers = {'user-agent': 'Googlebot'}
with open('robots.txt', 'w', encoding='utf-8') as f:
    f.write(html.text)

with open('robots.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        # strip() already removes the trailing '\n'; assign back, since strings are immutable
        line = line.strip().replace('\n', '')

Now, inside the loop, check whether our agent appears in the line; once it does, collect the Disallow values that follow into a new list, using a flag to mark that we are inside the matching block:

lines = f.readlines()
domain = []
flag = False
for line in lines:
    line = line.strip().replace('\n', '')
    if headers['user-agent'] in line:
        flag = True                 # entered the block for our agent
        continue
    elif line.startswith('Disallow'):
        if flag is True:
            domain.append(line.replace('Disallow: ', ''))
    elif line is None or line == '':
        if flag is True:            # blank line ends our agent's block
            break
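For the Googlebot block shown in the robots.txt output above, domain should end up holding the disallowed path prefixes, roughly:

['/baidu', '/s?', '/shifen/', '/homepage/', '/cpro', '/ulink?', '/link?', '/home/news/data/', '/bh']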

Here is the complete code, wrapped in a class:

from urllib.parse import urljoin
import requests

# url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875'
# ends = "robots.txt"

class Robots:
    def __init__(self, url, Agent):
        self.Baseurl = url                      # original URL we want to crawl
        self.url = url                          # will be rewritten to the robots.txt URL
        self.headers = {'user-agent': Agent}
        self.ends = 'robots.txt'
        self.Dourl()

    def Dourl(self):
        # keep only scheme + host, then append robots.txt
        url = self.url.split('/')
        url = '/'.join(url[:3])
        url = urljoin(url, self.ends)
        self.url = url

    def getRobots(self):
        html = requests.get(self.url)
        with open('robots.txt', 'w', encoding='utf-8') as f:
            f.write(html.text)
        with open('robots.txt', 'r', encoding='utf-8') as f:
            lines = f.readlines()
        domain = []
        flag = False
        for line in lines:
            line = line.strip().replace('\n', '')
            if self.headers['user-agent'] in line:
                flag = True                 # entered the block for our agent
                continue
            elif line.startswith('Disallow'):
                if flag is True:
                    domain.append(line.replace('Disallow: ', ''))
            elif line is None or line == '':
                if flag is True:            # blank line ends our agent's block
                    break
        for d in domain:
            if d in self.Baseurl:
                print("Crawling this URL is disallowed by robots.txt")
                return False
        return True


if __name__ == '__main__':
    url = input('url is >>')
    agent = input('agent is >>')
    # url = 'https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0xc6f64e0200000143&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=1&rsv_sug7=100&rsv_btype=i&inputT=993&rsv_sug4=993'
    # agent = 'Googlebot'
    r = Robots(url, agent)
    print(r.getRobots())
F:\python post\code>python RobotsTest.py
url is >>https://www.baidu.com/s?wd=123&rsv_spt=1&rsv_iqid=0x8d22781d000014ad&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_dl=tb&rsv_enter=0&rsv_sug3=3&rsv_sug1=2&rsv_sug7=100&rsv_btype=i&inputT=875&rsv_sug4=875
agent is >>Googlebot
Crawling this URL is disallowed by robots.txt
False

F:\python post\code>
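For comparison, the standard library also ships urllib.robotparser, which performs the same check without any manual parsing. A minimal sketch (this is not the code above, and the fetch only works if the site serves robots.txt to urllib's default User-Agent):

from urllib.robotparser import RobotFileParser

url = 'https://www.baidu.com/s?wd=123'
rp = RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()                                  # download and parse robots.txt
print(rp.can_fetch('Googlebot', url))      # expected False, since /s? is disallowed for Googlebot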