前言

娱乐项目记载:爬取网络上的小说


一、演示

Python --爬虫小说学习-仅限于个人娱乐-LMLPHP

二、文件目录示意

Python --爬虫小说学习-仅限于个人娱乐-LMLPHP

三、使用步骤

1.引入库

代码如下(示例):

import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices  # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl

2.界面控制程序

代码如下:

#_*_ coding:utf-8 _*_

'''
#1.获取书名
#2.获取链接和目录名
#3.获取内容
#4.保存内容'''

import requests
from lxml import html #调用lxml模块和requests模块
from pangchong import Worker
import webbrowser
import time,os
from Ui_dowondstory import Ui_MainWindow
import sys
from PyQt5.QtGui import QIcon,QDesktopServices  # 用于添加图标
from PyQt5.QtWidgets import QMainWindow,QApplication
from PyQt5.QtCore import QUrl


class LanFei_show_window(QMainWindow,Ui_MainWindow):  # inherits the generated main-window UI class
    """Main window of the novel downloader.

    The window reads a catalogue URL from ``lineEdit``, scrapes the book title
    and the chapter list from that page, and hands the result to a ``Worker``
    thread that downloads the chapters. Progress messages sent back by the
    worker are shown in ``textBrowser``.
    """

    def __init__(self):
        super().__init__()
        self.setupUi(self)  # build the widgets declared in Ui_MainWindow
        self.setWindowIcon(QIcon("./IMG/icon/icon.jpg"))
        self.setWindowTitle("测试使用")
        # No download has been started yet; dowtext() checks this before
        # trying to stop a worker.
        self.worker = None
        self.initUI()

    def initUI(self):
        """Wire the buttons and the text browser to their handlers."""
        self.pushButton.clicked.connect(self.openurl)
        self.pushButton_2.clicked.connect(self.dowtext)
        self.lineEdit.setText("https://www.xtyxsw.org/read/280637/")
        # Connect the link handler exactly once. Connecting it every time a
        # download completes would make one click open the folder once per
        # completed download.
        self.textBrowser.setOpenLinks(False)
        self.textBrowser.setOpenExternalLinks(False)
        self.textBrowser.anchorClicked.connect(self.click_textbrowser)

    def click_textbrowser(self, url=None):
        """Open the current working directory in the system file manager.

        ``url`` is the clicked link as delivered by ``anchorClicked``; it is
        accepted for signal compatibility and otherwise ignored.
        """
        self.msg = os.getcwd()
        QDesktopServices.openUrl(QUrl.fromLocalFile(self.msg))

    def openurl(self):
        """Open the URL typed into the line edit in the default web browser."""
        geturl = self.lineEdit.text()
        print(geturl)
        print("打开网址:{}".format(geturl))
        if geturl:
            webbrowser.open(geturl)
        else:
            self.textBrowser.append("<font color=\"#FF0000\">{}:请先输入网址路径!</font> ".format(self.gettime()))

    def gettime(self):
        """Return the current local time formatted as ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def dowtext(self):
        """Start a download, or ask the running worker to stop.

        The button label doubles as the state: "下载" starts a download,
        "停止" requests the worker to stop.
        """
        label = self.pushButton_2.text()
        if label == "下载":
            geturl = self.lineEdit.text()
            if not geturl:
                self.textBrowser.append("<font color=\"#FF0000\">{}:请先输入网址路径!</font> ".format(self.gettime()))
                return
            try:
                self.test(geturl)
            except Exception as exc:
                # Slot boundary: a network or parsing failure must be shown to
                # the user instead of escaping from the slot.
                self.textBrowser.append("<font color=\"#FF0000\">{}:获取目录失败:{}</font> ".format(self.gettime(), exc))
                return
            self.pushButton_2.setText("停止")
        elif label == "停止" and self.worker is not None:
            self.worker.change_ret()

    def test(self,url):
        """Scrape title and chapter list from ``url`` and start the worker.

        Raises whatever ``get_book_url`` / ``get_dir`` raise (network errors,
        ``IndexError`` when the title is not found).
        """
        book_name = self.get_book_url(url)
        print("获取书名:" + book_name)

        self.textBrowser.append("{}:".format(self.gettime())+"获取书名--" + book_name)

        htmls_list,name_list = self.get_dir(url)

        # Payload layout expected by Worker: [book name, chapter names, chapter links].
        self.data = [book_name,name_list,htmls_list]

        self.worker = Worker(msg=self.data)
        self.worker.finished.connect(self.receive)
        self.worker.start()

    def get_url(self,url,timeout=10):
        """Download ``url`` and return its body decoded as UTF-8.

        ``timeout`` (seconds) keeps a dead server from blocking the caller
        forever. Raises ``requests.RequestException`` on network or HTTP
        errors and ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content.decode("utf-8")

    def get_book_url(self,url):
        """Return the book title scraped from the catalogue page at ``url``.

        Raises ``IndexError`` when the title element is not found.
        """
        selector = html.fromstring(self.get_url(url))
        shumin = selector.xpath('/html/body/div[3]/div[2]/div/span/text()')
        if not shumin:
            raise IndexError("未找到书名,页面结构可能已变化")
        return shumin[0]

    def get_dir(self,url):
        """Return ``(chapter_links, chapter_names)`` scraped from ``url``.

        Both lists are built from the same anchor elements so that index ``i``
        in one list always belongs to index ``i`` in the other, even when an
        anchor has no text or contains nested markup.
        """
        htmls_list = []
        names_list = []
        selector = html.fromstring(self.get_url(url))

        for anchor in selector.xpath('//div[@class = "link_14"]/dl/dd/a[@href]'):
            htmls_list.append(str(anchor.get("href")))
            names_list.append(str(anchor.text_content()))

        print("每章节链接:" + str(htmls_list) )
        print("每章节目录:" + str(names_list))
        print(len(names_list))
        return htmls_list,names_list

    def receive(self,text=None):
        """Show a ``[code, message]`` progress report sent by the worker.

        Code 1 is shown in blue, code 2 in red. Code 3 is shown as a link that
        opens the download folder, and also switches the button back to
        "下载". Malformed payloads are ignored.
        """
        if not text or len(text) < 2:
            return
        code, message = text[0], text[1]
        if code == 1:
            self.textBrowser.append("<font color=\"#0000FF\">{}:{}</font> ".format(self.gettime(),message))
        elif code == 2:
            self.textBrowser.append("<font color=\"#FF0000\">{}:{}</font> ".format(self.gettime(),message))
        elif code == 3:
            # Give the link a real target; the handler connected in initUI()
            # opens the folder when it is clicked.
            folder = QUrl.fromLocalFile(os.getcwd()).toString()
            self.textBrowser.append("<a href=\"{}\">{}:{}</a>".format(folder,self.gettime(),message))
            self.pushButton_2.setText("下载")


if __name__ == "__main__":
    # Create the Qt application, show the main window, then pass the event
    # loop's return value to the interpreter as the process exit status.
    application = QApplication(sys.argv)
    main_window = LanFei_show_window()
    main_window.show()
    exit_status = application.exec_()
    sys.exit(exit_status)

界面文件 Ui_dowondstory.py(由 pyuic5 根据 dowondstory.ui 自动生成)代码如下:

# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'd:\pythonitem\爬虫小说\dowondstory.ui'
#
# Created by: PyQt5 UI code generator 5.15.11
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_MainWindow(object):
    """Widget layout of the main window, as generated by pyuic5.

    The file header warns that manual changes are lost when pyuic5 is run
    again, so behaviour belongs in a subclass rather than here.
    """

    def setupUi(self, MainWindow):
        """Create all child widgets on ``MainWindow`` and lay them out.

        Row 0 of the grid holds the label, the URL line edit and the two push
        buttons; row 1 holds the text browser spanning all four columns.
        """
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(579, 368)
        self.centralwidget = QtWidgets.QWidget(MainWindow)
        self.centralwidget.setObjectName("centralwidget")
        self.gridLayout = QtWidgets.QGridLayout(self.centralwidget)
        self.gridLayout.setObjectName("gridLayout")
        # Row 0, column 0: caption label.
        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setObjectName("label")
        self.gridLayout.addWidget(self.label, 0, 0, 1, 1)
        # Row 0, column 1: line edit.
        self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
        self.lineEdit.setObjectName("lineEdit")
        self.gridLayout.addWidget(self.lineEdit, 0, 1, 1, 1)
        # Row 0, column 2: first push button.
        self.pushButton = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton.setObjectName("pushButton")
        self.gridLayout.addWidget(self.pushButton, 0, 2, 1, 1)
        # Row 0, column 3: second push button.
        self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
        self.pushButton_2.setObjectName("pushButton_2")
        self.gridLayout.addWidget(self.pushButton_2, 0, 3, 1, 1)
        # Row 1, columns 0-3: text browser.
        self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
        self.textBrowser.setObjectName("textBrowser")
        self.gridLayout.addWidget(self.textBrowser, 1, 0, 1, 4)
        MainWindow.setCentralWidget(self.centralwidget)
        self.menubar = QtWidgets.QMenuBar(MainWindow)
        self.menubar.setGeometry(QtCore.QRect(0, 0, 579, 23))
        self.menubar.setObjectName("menubar")
        MainWindow.setMenuBar(self.menubar)
        self.statusbar = QtWidgets.QStatusBar(MainWindow)
        self.statusbar.setObjectName("statusbar")
        MainWindow.setStatusBar(self.statusbar)

        # Apply the user-visible texts, then let Qt connect slots by object name.
        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the label/button captions."""
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.label.setText(_translate("MainWindow", "下载地址:"))
        self.pushButton.setText(_translate("MainWindow", "打开"))
        self.pushButton_2.setText(_translate("MainWindow", "下载"))

3.QT业务控制程序

下载线程 pangchong.py(提供界面程序导入的 Worker 类)代码如下:

import requests
import os
from lxml import html #调用lxml模块和requests模块
import time
import time
from PyQt5.QtCore import QThread,pyqtSignal
import threading

class Worker(QThread):
    """Background thread that downloads every chapter of one book.

    ``msg`` is ``[book_name, chapter_names, chapter_links]``. Progress is
    reported through ``finished`` as ``[code, message]``:

    * ``1`` - one chapter was written,
    * ``2`` - the download was stopped, or one chapter failed,
    * ``3`` - the thread is done (always the last report).
    """

    # NOTE: this name shadows QThread's built-in ``finished`` signal. It is
    # kept because the main window connects to ``worker.finished``.
    finished = pyqtSignal(list)

    # Chapter links in the catalogue are site-relative; this is the site root.
    BASE_URL = "https://www.xtyxsw.org"

    # Characters that are not allowed in Windows file names.
    _INVALID_FILENAME_CHARS = str.maketrans("", "", '\\/:*?"<>|')

    def __init__(self,msg=None):
        super().__init__()
        self.msg = msg
        # "True" while running; change_ret() switches it to "break".
        self.ret = "True"

    def run(self):
        """Download the chapters one after another until done or stopped."""
        if self.msg:
            book_name, name_list, htmls_list = self.msg[0], self.msg[1], self.msg[2]
            for number in range(len(name_list)):
                if self.ret == "break":
                    self.finished.emit([2,"已停止下载!"])
                    break
                try:
                    # Called directly: the original wrapped this call in a
                    # threading.Thread but evaluated save() eagerly while
                    # building it, so it already ran here, sequentially.
                    self.save(book_name,name_list,htmls_list,number)
                except Exception as exc:
                    # Thread boundary: report the failed chapter and carry on
                    # instead of letting the exception end the thread.
                    self.finished.emit([2,"第{}章下载失败:{}".format(number + 1, exc)])
        # Code 3 is emitted even after a stop, because the main window only
        # switches its button back to "下载" when it receives code 3.
        self.finished.emit([3,"完成下载!"])

    def change_ret(self):
        """Ask ``run`` to stop before it starts the next chapter."""
        self.ret = "break"

    def get_url(self,url,timeout=10):
        """Download ``url`` and return its body decoded as UTF-8.

        ``timeout`` (seconds) keeps a dead server from blocking the thread
        forever. Raises ``requests.RequestException`` on network or HTTP
        errors and ``UnicodeDecodeError`` if the body is not valid UTF-8.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content.decode("utf-8")

    @classmethod
    def _safe_name(cls,name):
        """Return ``name`` as a string without characters invalid in file names."""
        return str(name).translate(cls._INVALID_FILENAME_CHARS)

    def get_neirong(self,htmls_list,number):
        """Return the text lines of chapter ``number``.

        If the chapter page contains a link labelled "下一页", the text of
        that one following page is appended as well.
        """
        url = self.BASE_URL + htmls_list[number]
        print("网址:" + url)
        selector = html.fromstring(self.get_url(url))

        liebiao = [str(line) for line in selector.xpath('//div[@id="content"]/p/text()')]

        txts = selector.xpath('//a/text()')
        if "下一页" in txts:
            dizhi = selector.xpath('//a/@href')
            print(dizhi)
            # NOTE(review): the fourth link from the end is assumed to be the
            # "下一页" link - confirm against the site's page layout.
            next_url = self.BASE_URL + dizhi[-4]
            next_page = html.fromstring(self.get_url(next_url))
            liebiao.extend(str(line) for line in next_page.xpath('//div[@id="content"]/p/text()'))

        return liebiao

    def save(self,book_name,name_list,htmls_list,number):
        """Download chapter ``number`` and write it to ``<cwd>/<book>/<chapter>.txt``.

        The book directory is created when missing. A negative ``number`` only
        creates the directory. Emits ``[1, message]`` after the file has been
        written, then sleeps for half a second between requests.
        """
        path = os.path.join(os.getcwd(), self._safe_name(book_name))
        os.makedirs(path, exist_ok=True)
        if number < 0:
            return

        liebiao = self.get_neirong(htmls_list,number)

        # Fall back to the chapter index when the title has no usable characters.
        mulu = self._safe_name(name_list[int(number)]) or str(int(number) + 1)
        paths = os.path.join(path, mulu + ".txt")

        with open(paths,"w",encoding= "utf-8") as file:
            file.writelines(wenzhi + "\n" for wenzhi in liebiao)

        h = "完成第" + str(int(number)+1) + "章写入!"
        print(h)
        self.finished.emit([1,h])
        time.sleep(0.5)

    def finisheds(self,i,h=None):
        """Emit ``[i, h]`` on the ``finished`` signal."""
        self.finished.emit([i,h])


4.批量修改文件名称

Python --爬虫小说学习-仅限于个人娱乐-LMLPHP

xiugainame.py:修改文件名称程序

代码如下:

import os


import re

# Batch-rename downloaded chapter files so that the chapter number is written
# with Arabic digits, e.g. "第一百零一章 标题.txt" -> "第101章 标题.txt".
# Only the number between "第" and "章" is converted; the rest of the file name
# (including Chinese numerals inside the chapter title) is left untouched.

path = "./末日重生:开局囤积SSS级卡牌小说"

# Value of every Chinese digit character.
_DIGITS = {
    "零": 0, "〇": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4,
    "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
}

# Multiplier of every Chinese unit character.
_UNITS = {"十": 10, "百": 100, "千": 1000}

# First "第<numeral>章" marker in a file name.
_CHAPTER_RE = re.compile("第([零〇一二两三四五六七八九十百千]+)章")


def chinese_to_int(text):
    """Convert a Chinese numeral below ten thousand to an int.

    Numerals written with units ("一百零一" -> 101, "十" -> 10) and purely
    positional ones ("一二三" -> 123) are both supported.

    Raises:
        ValueError: if ``text`` is empty or contains an unsupported character.
    """
    if not text:
        raise ValueError("empty numeral")
    for ch in text:
        if ch not in _DIGITS and ch not in _UNITS:
            raise ValueError("unsupported numeral character: {!r}".format(ch))

    if not any(ch in _UNITS for ch in text):
        # No unit characters: read the digits positionally.
        return int("".join(str(_DIGITS[ch]) for ch in text))

    total = 0
    pending = 0  # digit waiting for its unit
    for ch in text:
        if ch in _DIGITS:
            pending = _DIGITS[ch]
        else:
            # A unit without a leading digit counts once ("十" is 10).
            total += (pending or 1) * _UNITS[ch]
            pending = 0
    return total + pending


def convert_name(filename):
    """Return ``filename`` with its first "第…章" number in Arabic digits.

    File names without such a marker are returned unchanged.
    """
    return _CHAPTER_RE.sub(
        lambda match: "第{}章".format(chinese_to_int(match.group(1))),
        filename,
        count=1,
    )


def rename_chapters(directory):
    """Rename every file in ``directory`` according to ``convert_name``.

    A file is skipped when its name would not change or when the target name
    already exists, so no file is ever overwritten.
    """
    files = os.listdir(directory)
    print(files)

    for shuju in files:
        combined_string = convert_name(shuju)
        print(combined_string)
        if combined_string == shuju:
            continue

        old_path = os.path.join(directory, shuju)
        new_path = os.path.join(directory, combined_string)
        if os.path.exists(new_path):
            print("目标文件已存在,跳过:" + combined_string)
            continue

        try:
            os.rename(old_path, new_path)
        except FileNotFoundError:
            print("源文件未找到")
        except PermissionError:
            print("权限不足,无法修改文件名")


if __name__ == "__main__":
    rename_chapters(path)

总结

娱乐使用,仅供参考,不同的网站可能格式不同,大家自行钻研,嘿嘿。

Python --爬虫小说学习-仅限于个人娱乐-LMLPHP

10-18 19:26