前言
娱乐项目记载:爬取网络上的小说
一、演示
二、文件目录示意
三、使用步骤
1.引入库
代码如下(示例):
import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl
2.界面控制程序
代码如下:
#_*_ coding:utf-8 _*_
'''
#1.获取书名
#2.获取链接和目录名
#3.获取内容
#4.保存内容'''
import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl
class LanFei_show_window(QMainWindow, Ui_MainWindow):
    """Main window of the novel downloader.

    The user types the URL of a book's index page, then either opens it in
    the system browser or downloads every chapter through a ``Worker``
    thread. Progress messages sent back by the worker are shown in the
    text browser.
    """

    # Seconds to wait for the site before a request is abandoned, so a dead
    # server cannot freeze the GUI thread forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        super().__init__()
        self.setupUi(self)  # build the widgets declared in Ui_MainWindow
        self.setWindowIcon(QIcon("./IMG/icon/icon.jpg"))
        self.setWindowTitle("测试使用")  # window title
        self.initUI()

    def initUI(self):
        """Connect the buttons and the log view, and pre-fill a sample URL."""
        self.pushButton.clicked.connect(self.openurl)
        self.pushButton_2.clicked.connect(self.dowtext)
        self.lineEdit.setText("https://www.xtyxsw.org/read/280637/")
        # Links in the log are handled by click_textbrowser instead of being
        # navigated to. Connecting here (once) rather than on every finished
        # download keeps a single click from firing the slot several times.
        self.textBrowser.setOpenLinks(False)
        self.textBrowser.setOpenExternalLinks(False)
        self.textBrowser.anchorClicked.connect(self.click_textbrowser)

    def click_textbrowser(self, url=None):
        """Open the current working directory in the system file manager.

        ``url`` is the link that was clicked; it is accepted so the method
        can serve as an ``anchorClicked`` slot but is not used.
        """
        self.msg = os.getcwd()
        QDesktopServices.openUrl(QUrl.fromLocalFile(self.msg))

    @staticmethod
    def _escape(message):
        """Return ``message`` with HTML metacharacters escaped."""
        return (
            str(message)
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    def _log(self, color, message):
        """Append a timestamped line in the given colour to the log view."""
        self.textBrowser.append(
            "<font color=\"{}\">{}:{}</font> ".format(
                color, self.gettime(), self._escape(message)
            )
        )

    def openurl(self):
        """Open the URL from the line edit in the default web browser."""
        geturl = self.lineEdit.text()
        print(geturl)
        print("打开网址:{}".format(geturl))
        if geturl:
            webbrowser.open(geturl)
        else:
            self._log("#FF0000", "请先输入网址路径!")

    def gettime(self):
        """Return the current local time as ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def dowtext(self):
        """Slot of the download/stop button.

        While the button reads "下载" a download is started and the button
        switches to "停止"; while it reads "停止" the running worker is asked
        to stop.
        """
        geturl = self.lineEdit.text()
        state = self.pushButton_2.text()
        if state == "下载":
            if not geturl:
                self._log("#FF0000", "请先输入网址路径!")
                return
            try:
                self.test(geturl)
            except (requests.RequestException, IndexError, ValueError) as err:
                # Network failure, undecodable page or unexpected page layout:
                # report it and leave the button untouched instead of letting
                # the exception escape from the slot.
                self._log("#FF0000", "获取目录失败:{}".format(err))
                return
            self.pushButton_2.setText("停止")
        elif state == "停止":
            self.worker.change_ret()

    def test(self, url):
        """Fetch the book title and chapter index, then start the worker.

        Raises whatever ``get_book_url`` / ``get_dir`` raise when the page
        cannot be fetched or does not have the expected layout.
        """
        book_name = self.get_book_url(url)
        print("获取书名:" + book_name)
        self.textBrowser.append("{}:".format(self.gettime()) + "获取书名--" + book_name)
        htmls_list, name_list = self.get_dir(url)
        self.data = [book_name, name_list, htmls_list]
        # The worker thread does the per-chapter downloading and reports back
        # through its ``finished`` signal.
        self.worker = Worker(msg=self.data)
        self.worker.finished.connect(self.receive)
        self.worker.start()

    def get_url(self, url):
        """Download ``url`` and return its body decoded as UTF-8."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return response.content.decode("utf-8")

    def get_book_url(self, url):
        """Return the book title found on the index page at ``url``.

        Raises:
            IndexError: the title XPath matched nothing.
        """
        selector = html.fromstring(self.get_url(url))
        shumin = selector.xpath('/html/body/div[3]/div[2]/div/span/text()')
        if not shumin:
            raise IndexError("book title not found on the page")
        return shumin[0]

    def get_dir(self, url):
        """Return ``(chapter_links, chapter_titles)`` read from the index page."""
        selector = html.fromstring(self.get_url(url))
        htmls_list = [
            str(href)
            for href in selector.xpath('//div[@class = "link_14"]/dl/dd/a/@href')
        ]
        names_list = [
            str(title)
            for title in selector.xpath('//div[@class = "link_14"]/dl/dd/a/text()')
        ]
        print("每章节链接:" + str(htmls_list))
        print("每章节目录:" + str(names_list))
        print(len(names_list))
        return htmls_list, names_list

    def receive(self, text=None):
        """Show a ``[code, message]`` progress report from the worker.

        Code 1 is logged in blue and code 2 in red. Code 3 is logged as a
        clickable link and switches the button back to "下载". Reports that
        are missing or too short are ignored.
        """
        if not text or len(text) < 2:
            return
        code, message = text[0], text[1]
        if code == 1:
            self._log("#0000FF", message)
        elif code == 2:
            self._log("#FF0000", message)
        elif code == 3:
            # Use a real target for the link; click_textbrowser opens the
            # working directory whatever the link points at.
            href = QUrl.fromLocalFile(os.getcwd()).toString()
            self.textBrowser.append(
                "<a href=\"{}\">{}:{}</a>".format(
                    href, self.gettime(), self._escape(message)
                )
            )
            self.pushButton_2.setText("下载")
if __name__ == "__main__":
    # Script entry point: create the Qt application, show the main window
    # and hand the event loop's exit status back to the interpreter.
    application = QApplication(sys.argv)
    main_window = LanFei_show_window()
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
Ui_dowondstory.py:界面布局文件(由 pyuic5 根据 dowondstory.ui 自动生成),代码如下:
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'd:\pythonitem\爬虫小说\dowondstory.ui'
#
# Created by: PyQt5 UI code generator 5.15.11
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtGui, QtWidgets
class Ui_MainWindow(object):
    """Widget layout of the downloader window, as emitted by pyuic5.

    ``setupUi`` creates a label, a line edit and two push buttons on the
    first grid row and a text browser on the second, plus an empty menu bar
    and a status bar. ``retranslateUi`` assigns the visible texts.
    """

    def setupUi(self, MainWindow):
        """Create all widgets and attach them to ``MainWindow``."""
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(579, 368)
        # Central widget with a grid layout that holds every control.
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        self.gridLayout = QtWidgets.QGridLayout(self.centralwidget)
        self.gridLayout.setObjectName("gridLayout")
        # Row 0, column 0: caption label.
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setObjectName("label")
        self.gridLayout.addWidget(self.label, 0, 0, 1, 1)
        # Row 0, column 1: URL input.
        self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit.setObjectName("lineEdit")
        self.gridLayout.addWidget(self.lineEdit, 0, 1, 1, 1)
        # Row 0, column 2: first button (text set in retranslateUi).
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.gridLayout.addWidget(self.pushButton, 0, 2, 1, 1)
        # Row 0, column 3: second button (text set in retranslateUi).
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.gridLayout.addWidget(self.pushButton_2, 0, 3, 1, 1)
        # Row 1, spanning all four columns: text browser.
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setObjectName("textBrowser")
        self.gridLayout.addWidget(self.textBrowser, 1, 0, 1, 4)
        MainWindow.setCentralWidget(self.centralwidget)
        # Menu bar and status bar; no entries are added to them here.
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 579, 23))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the texts of the label and both buttons."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.label.setText(_translate("MainWindow", "下载地址:"))
        self.pushButton.setText(_translate("MainWindow", "打开"))
        self.pushButton_2.setText(_translate("MainWindow", "下载"))
3.QT业务控制程序
pangchong.py:下载线程 Worker,代码如下:
import requests
import os
from lxml import html #调用lxml模块和requests模块
import time
import time
from PyQt5.QtCore import QThread,pyqtSignal
import threading
class Worker(QThread):
    """Background thread that downloads the chapters of one book to disk.

    ``msg`` is ``[book_name, chapter_titles, chapter_links]``. Progress is
    reported through ``finished`` as a two-item list ``[code, message]``:
    code 1 after a chapter has been written, code 2 when the download was
    stopped or a chapter failed, code 3 when ``run`` is about to return.
    """

    # NOTE(review): this attribute shares its name with QThread's built-in
    # ``finished`` signal; it is kept because callers connect to it.
    finished = pyqtSignal(list)

    # Site root that chapter links are resolved against.
    BASE_URL = "https://www.xtyxsw.org"
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10
    # Characters that Windows does not allow in file names; they are removed
    # from chapter titles before the title is used as a file name.
    _FORBIDDEN = str.maketrans("", "", '\\/:*?"<>|')

    def __init__(self, msg=None):
        super().__init__()
        self.msg = msg
        # Set to "break" by change_ret() to stop before the next chapter.
        self.ret = "True"

    def run(self):
        """Download the chapters one after another until done or stopped."""
        book_name, name_list, htmls_list = self.msg[0], self.msg[1], self.msg[2]
        for number in range(len(name_list)):
            if self.ret == "break":
                self.finished.emit([2, "已停止下载!"])
                break
            # save() is called directly: the earlier
            # ``Thread(target=self.save(...))`` already ran it synchronously
            # here and then started a thread with no target.
            try:
                self.save(book_name, name_list, htmls_list, number)
            except (requests.RequestException, OSError, ValueError, IndexError) as err:
                # Report the failed chapter and carry on with the next one
                # instead of letting the exception end the thread.
                self.finished.emit([2, "第{}章下载失败:{}".format(number + 1, err)])
        # Always sent, also after a stop: the receiver uses code 3 to return
        # to its idle state.
        self.finished.emit([3, "完成下载!"])

    def change_ret(self):
        """Ask ``run`` to stop before it starts the next chapter."""
        self.ret = "break"

    def get_url(self, url):
        """Download ``url`` and return its body decoded as UTF-8."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return response.content.decode("utf-8")

    def get_neirong(self, htmls_list, number):
        """Return the paragraphs of chapter ``number`` as a list of strings.

        When the chapter page contains a link labelled "下一页", the
        paragraphs of that following page are appended as well.
        """
        from urllib.parse import urljoin

        content_xpath = '//div[@id="content"]/p/text()'
        url = urljoin(self.BASE_URL, htmls_list[number])
        print("网址:" + url)
        selector = html.fromstring(self.get_url(url))
        liebiao = [str(paragraph) for paragraph in selector.xpath(content_xpath)]
        if "下一页" in selector.xpath('//a/text()'):
            dizhi = selector.xpath('//a/@href')
            print(dizhi)
            # NOTE(review): the next-page link is taken by position (fourth
            # link from the end of the page) - confirm against the site.
            next_page = html.fromstring(
                self.get_url(urljoin(self.BASE_URL, dizhi[-4]))
            )
            liebiao.extend(str(paragraph) for paragraph in next_page.xpath(content_xpath))
        return liebiao

    def save(self, book_name, name_list, htmls_list, number):
        """Download chapter ``number`` and write it to ``<cwd>/<book>/<title>.txt``.

        The book directory is created when missing. Nothing is downloaded
        for a negative ``number``. Emits ``[1, message]`` after the file has
        been written, then pauses for half a second.
        """
        path = os.path.join(os.getcwd(), str(book_name))
        os.makedirs(path, exist_ok=True)
        if number < 0:
            return
        liebiao = self.get_neirong(htmls_list, number)
        mulu = str(name_list[int(number)]).translate(self._FORBIDDEN)
        paths = os.path.join(path, mulu + ".txt")
        with open(paths, "w", encoding="utf-8") as file:
            file.writelines(wenzhi + "\n" for wenzhi in liebiao)
        h = "完成第" + str(int(number) + 1) + "章写入!"
        print(h)
        self.finished.emit([1, h])
        time.sleep(0.5)  # short pause between chapters

    def finisheds(self, i, h=None):
        """Emit ``[i, h]`` on the ``finished`` signal."""
        self.finished.emit([i, h])
4.批量修改文件名称
>xiugainame.py:修改文件名称程序
代码如下:
"""Rename downloaded chapter files so Chinese chapter numbers become digits.

Only the number between "第" and "章" is converted (for example
"第一百一十章 xxx.txt" becomes "第110章 xxx.txt"); the rest of the file name
is left untouched.
"""
import os
import re

# Directory whose files are renamed when this script is run.
path = "./末日重生:开局囤积SSS级卡牌小说"

_DIGITS = {
    "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
    "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
}
_UNITS = {"十": 10, "百": 100, "千": 1000}
_CHAPTER_PATTERN = re.compile("第([零一二三四五六七八九十百千]+)章")


def chinese_numeral_to_int(text):
    """Return the integer written by the Chinese numeral ``text``.

    Handles both unit notation ("一百零五" -> 105, "十" -> 10) and plain
    digit sequences ("一二三" -> 123).

    Raises:
        ValueError: ``text`` contains a character that is not a numeral.
    """
    total = 0
    pending = 0  # digits read since the last unit character
    for char in text:
        if char in _DIGITS:
            pending = pending * 10 + _DIGITS[char]
        elif char in _UNITS:
            # A unit without a leading digit counts once ("十" is 10).
            total += (pending or 1) * _UNITS[char]
            pending = 0
        else:
            raise ValueError("not a Chinese numeral: {!r}".format(text))
    return total + pending


def convert_chapter_name(filename):
    """Return ``filename`` with its first "第<numeral>章" written in digits.

    Names without such a chapter marker are returned unchanged.
    """
    def _to_digits(match):
        return "第{}章".format(chinese_numeral_to_int(match.group(1)))

    return _CHAPTER_PATTERN.sub(_to_digits, filename, count=1)


def rename_chapter_files(directory):
    """Rename every file in ``directory`` according to convert_chapter_name."""
    for old_name in os.listdir(directory):
        new_name = convert_chapter_name(old_name)
        if new_name == old_name:
            continue
        print(new_name)
        old_path = os.path.join(directory, old_name)
        new_path = os.path.join(directory, new_name)
        if os.path.exists(new_path):
            # Never overwrite another chapter that already has this name.
            print("目标文件已存在,跳过:" + new_name)
            continue
        try:
            os.rename(old_path, new_path)
        except FileNotFoundError:
            print("源文件未找到")
        except PermissionError:
            print("权限不足,无法修改文件名")


if __name__ == "__main__":
    rename_chapter_files(path)
总结
娱乐使用,仅供参考。不同网站的页面结构可能不同,请大家自行钻研,嘿嘿。