- 小说网址:http://www.uxiaoshuo.com
- 使用绝色妖娆:鬼医至尊为例
下面是程序代码
#coding:utf-8
import requests
import threading
from bs4 import BeautifulSoup
import re
import os
import time
import sys
import threading
# HTTP request headers for www.uxiaoshuo.com. The values look like they were
# captured from a desktop Chrome session (see the User-Agent) -- the Cookie is
# a fixed snapshot, not a live session.
req_header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": (
        "UM_distinctid=162afbabff819e-03f2f082776e95-b34356b-1fa400-162afbabff9294; "
        "CNZZDATA1259019190=1993576859-1523364262-https%253A%252F%252Fwww.baidu.com%252F%7C1523364262; "
        "bookid=124629; chapterid=6510968; "
        "chaptername=%25u7B2C1%25u7AE0%2520%25u6797%25u4E2D%25u9634%25u8C0B"
    ),
    "Host": "www.uxiaoshuo.com",
    "Proxy-Connection": "keep-alive",
    "Referer": "https://www.uxiaoshuo.com/",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    ),
}

# Site root; chapter paths such as '/124/124629/7404934.html' are appended to it.
req_url_base = "http://www.uxiaoshuo.com"
def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser."""
    # Send req_header as real HTTP headers (it used to be passed as `params`,
    # which turned every header into a URL query parameter). Accept-Encoding
    # is left out so requests advertises only encodings it can decode itself;
    # the module-level value includes "br", which needs an optional package.
    headers = {
        name: value
        for name, value in req_header.items()
        if name.lower() != 'accept-encoding'
    }
    # A timeout keeps a stalled connection from hanging the download forever.
    res = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(res.text, "html.parser")


def get_txt(txt_id):
    """Download a novel chapter by chapter into '<title>.txt'.

    Starting at the chapter page ``req_url_base + txt_id``, each chapter's
    heading and body text are appended (UTF-8) to a file named after the
    novel title, then the "next" link is followed. The walk stops as soon as
    the next link no longer ends in ``html``.

    Args:
        txt_id: Site-relative path of the first chapter page, e.g.
            ``'/124/124629/7404934.html'``. Converted with ``str()``.

    Returns:
        None. Errors while fetching, parsing or writing are printed and the
        download stops; chapters written so far stay in the file.
    """
    chapter_path = str(txt_id)
    req_url = req_url_base + chapter_path  # full URL of the first chapter
    print("小说编号:"+req_url)
    try:
        soups = _fetch_soup(req_url)
        # The second link in the breadcrumb bar holds the novel title.
        title = soups.select('#webhtml .box_con .con_top a')[1].text
        # Binary append mode keeps the explicit '\r\n' sequences untouched;
        # the context manager guarantees the file is closed on any exit.
        with open('{0}.txt'.format(title), "ab") as fo:
            while chapter_path.split('.')[-1] == 'html':
                c_title = soups.select('#webhtml .box_con .zhangjieming h1')[0].text
                content = soups.select('#webhtml .box_con .zhangjieTXT')[0]
                # Drop embedded scripts and nested divs so only the chapter
                # text itself remains.
                for node in content.select("script"):
                    node.decompose()
                for node in content.select("div"):
                    node.decompose()
                # Collapse every whitespace run into a line break plus tab,
                # i.e. one indented paragraph per line.
                body = re.sub(r'\s+', '\r\n\t', content.text).strip('\r\n')
                fo.write(('\n'+c_title+'\r\n').encode('UTF-8'))
                fo.write(('\n'+body+'\n').encode('UTF-8'))
                print(c_title)
                # The fourth link of the bottom navigation bar is "next".
                chapter_path = soups.select('#webhtml .zhangjieming .bottem1 a')[3]['href']
                # Only fetch when there really is another chapter page.
                if chapter_path.split('.')[-1] == 'html':
                    soups = _fetch_soup(req_url_base + chapter_path)
            print(title+"全部下载成功!")
    except Exception as e:
        # Best-effort: report the problem and give up on this novel. Unlike a
        # `return` inside `finally`, this lets KeyboardInterrupt propagate.
        print(e)
# Download each novel in turn, starting from the path of its first chapter.
for first_chapter_path in ('/124/124629/7404934.html', '/135/135169/7373986.html'):
    get_txt(first_chapter_path)
要爬取该网站内的其他小说,只需要将 get_txt 的参数改成所需小说第一章页面的路径(例如 '/124/124629/7404934.html')就可以了