使用 Python 爬取56听书网的有声小说,网站地址:www.ting56.com。
单线程
import os
import requests
from lxml import etree
import re
def get_url(url):
    """Fetch a page and return its decoded source text.

    The request is sent with a desktop Chrome User-Agent header. Before the
    body is decoded, the response encoding is replaced with the one that
    ``requests`` detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status other than 404.

    Note:
        A 404 response prints a message and terminates the whole process
        immediately via ``os._exit`` (with a non-zero status, since this is
        a failure).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
    # Without a timeout a stalled server would block this call forever.
    res = requests.get(url, headers=headers, timeout=30)
    res.encoding = res.apparent_encoding
    if res.status_code == 404:
        print('获取源码失败,退出程序')
        os._exit(1)
    # Other error statuses (403, 500, ...) do not carry the expected page
    # either; fail here instead of letting the parsers choke on an error page.
    res.raise_for_status()
    return res.text
def get_con(url):
    """Decode the obfuscated links embedded in a chapter page.

    The page source is searched for a ``var datas=`` assignment. Its value is
    split on ``*``; every piece except the first and the last is read as a
    decimal character code and converted with ``chr``. The decoded string is
    then split on ``&``.

    Args:
        url: Address of the chapter page.

    Returns:
        The list of ``&``-separated parts of the decoded string. The caller
        uses the first element as the audio URL.

    Raises:
        ValueError: If the page contains no ``var datas=`` assignment, or a
            piece between the first and last ``*`` is not a decimal number.
    """
    xml = get_url(url)
    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    match = re.search(r'var datas=\S*', xml)
    if match is None:
        raise ValueError(f'no "var datas=" assignment found in page: {url}')
    # The first and last pieces are skipped; presumably they hold the
    # surrounding JavaScript syntax rather than character codes.
    codes = match[0].split('*')[1:-1]
    murl = ''.join(chr(int(code)) for code in codes)
    return murl.split('&')
def get_list(url):
    """Download every chapter listed on a book's catalogue page.

    The book title is read from ``//div[@class="tit"]/h1`` and the chapter
    links from the ``<li><a href>`` entries under the element with id
    ``vlink_1``. Each chapter is saved as ``D:/<title>/<index><suffix>``,
    where ``index`` starts at 1 and ``suffix`` is the last four characters of
    the decoded audio URL.

    Args:
        url: Address of the book's catalogue page.

    Raises:
        ValueError: If the page does not contain the expected title element.
    """
    res = get_url(url)
    page = etree.HTML(res)
    titles = page.xpath('//div[@class="tit"]/h1/text()')  # book title
    if not titles:
        raise ValueError(f'book title not found on page: {url}')
    name = titles[0].strip()
    lists = page.xpath('//*[@id="vlink_1"]/ul/li/a/@href')  # chapter links
    count = int(page.xpath('count(//*[@id="vlink_1"]/ul/li)'))  # chapter count
    path = 'D:/' + name + '/'
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(path, exist_ok=True)
    for i, href in enumerate(lists, start=1):
        surl = 'http://www.ting56.com' + href
        # Decode the chapter page once; a second fetch would only double the
        # number of requests sent to the site.
        conurl = get_con(surl)
        suffix = conurl[0][-4:]  # file extension taken from the audio URL
        res = requests.get(conurl[0], timeout=30).content
        print(f'正在下载【{name}】的第{i}/{count}章,当前音频所在url:{surl}')
        with open(path + str(i) + suffix, 'wb') as f:
            f.write(res)
# Entry point of the single-threaded downloader. Guarded so that importing
# this module does not prompt for input or start a download.
if __name__ == '__main__':
    # Expects the catalogue page of a book,
    # e.g. http://www.ting56.com/mp3/21163.html
    url = input('输入图书所在链接')
    get_list(url)
多线程
import os
import re
import threading
import time
import requests
from lxml import etree
def get_url(url):
    """Fetch a page and return its decoded source text.

    The request is sent with an MSIE 8 User-Agent header. Before the body is
    decoded, the response encoding is replaced with the one that ``requests``
    detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status other than 404.

    Note:
        A 404 response prints a message and terminates the whole process
        immediately via ``os._exit`` (with a non-zero status, since this is
        a failure). This also applies when called from a worker thread.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
    # Without a timeout a stalled server would block the calling thread forever.
    res = requests.get(url, headers=headers, timeout=30)
    res.encoding = res.apparent_encoding
    if res.status_code == 404:
        print('获取源码失败,退出程序')
        os._exit(1)
    # Other error statuses (403, 500, ...) do not carry the expected page
    # either; fail here instead of letting the parsers choke on an error page.
    res.raise_for_status()
    return res.text
def get_con(url):
    """Decode the obfuscated audio link embedded in a chapter page.

    The page source is searched for a ``var datas=`` assignment. Its value is
    split on ``*``; every piece except the first and the last is read as a
    decimal code point and converted with ``chr``. The decoded string is then
    split on ``&`` and its first part is returned.

    Args:
        url: Address of the chapter page.

    Returns:
        The part of the decoded string before the first ``&``; the caller
        uses it as the audio URL.

    Raises:
        ValueError: If the page contains no ``var datas=`` assignment, or a
            piece between the first and last ``*`` is not a decimal number.
    """
    xml = get_url(url)
    # Raw string: '\S' in a plain literal is an invalid escape sequence.
    match = re.search(r'var datas=\S*', xml)
    if match is None:
        raise ValueError(f'no "var datas=" assignment found in page: {url}')
    # The first and last pieces are skipped; presumably they hold the
    # surrounding JavaScript syntax rather than code points.
    codes = match[0].split('*')[1:-1]
    murl = ''.join(chr(int(code)) for code in codes)
    return murl.split('&')[0]
def get_list(url):
    """Collect a book's chapter links and prepare its download folder.

    The book title is read from ``//div[@class="tit"]/h1`` and the chapter
    links from the ``<li><a href>`` entries under the element with id
    ``vlink_1``. The folder ``D:/<title>/`` is created if it does not exist
    and a start message with the chapter count is printed.

    Args:
        url: Address of the book's catalogue page.

    Returns:
        A ``(lists, path)`` tuple: the chapter hrefs found on the page and
        the folder path (with trailing slash) the chapters are saved into.

    Raises:
        ValueError: If the page does not contain the expected title element.
    """
    res = get_url(url)
    page = etree.HTML(res)
    titles = page.xpath('//div[@class="tit"]/h1/text()')  # book title
    if not titles:
        raise ValueError(f'book title not found on page: {url}')
    title = titles[0].strip()
    lists = page.xpath('//*[@id="vlink_1"]/ul/li/a/@href')  # chapter links
    count = int(page.xpath('count(//*[@id="vlink_1"]/ul/li)'))  # chapter count
    path = 'D:/' + title + '/'
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(path, exist_ok=True)
    print(f'开始下载下载【{title}】,本书一共{count}章,请稍等……')
    return lists, path
def download_url(list, path, i, sem):
    """Download one chapter and release the caller's semaphore permit.

    Intended to run as a thread target: the caller acquires a permit from
    ``sem`` before starting the thread and this function gives it back when
    it finishes.

    Args:
        list: Site-relative href of the chapter page (the name shadows the
            builtin but is kept for interface compatibility).
        path: Folder path, including the trailing slash, to save into.
        i: Chapter index; used as the file name.
        sem: Semaphore whose permit is released when the download ends.

    Any exception raised while downloading propagates to the caller (the
    thread machinery), but the permit is always released first.
    """
    try:
        surl = 'http://www.ting56.com' + list
        conurl = get_con(surl)
        suffix = conurl[-4:]  # file extension taken from the audio URL
        # Without a timeout a stalled server would hold the permit forever.
        res = requests.get(conurl, timeout=30).content
        print(f'正在下载的第{i} ,当前音频所在url:【{surl}】')
        with open(path + str(i) + suffix, 'wb') as f:
            f.write(res)
    finally:
        # Release even on failure; otherwise each failed chapter leaks a
        # permit and the producer loop eventually blocks on acquire().
        sem.release()
def threads(lists, i=0, ths=None):
    """Download all chapters concurrently, one thread per chapter.

    At most 60 downloads run at the same time; a semaphore permit is taken
    before each thread is started and is given back by ``download_url``.
    After all threads have been joined, a summary is printed.

    Args:
        lists: ``(hrefs, path)`` pair as returned by ``get_list``.
        i: Index offset; the first chapter is numbered ``i + 1``.
        ths: Optional list that the started threads are appended to. A fresh
            list is used when omitted.

    Note:
        Only chapters whose thread could not be started are reported as
        failed; errors raised inside a running download thread are not
        collected here.
    """
    if ths is None:
        # A mutable default would be shared between calls.
        ths = []
    erro = []
    path = lists[1]
    sem = threading.Semaphore(60)  # maximum number of concurrent downloads
    for href in lists[0]:
        sem.acquire()  # take a permit; blocks while 60 downloads are running
        i += 1
        try:
            # Pass the link, target folder, chapter index and semaphore.
            th = threading.Thread(target=download_url, args=(href, path, i, sem))
            th.start()
        except Exception:
            # The thread never ran, so nobody else will return this permit.
            sem.release()
            print(f'第{i}章下载失败,链接:{href}')
            print('等待所有线程结束后重试!')
            erro.append(href)
            continue
        ths.append(th)
        # Brief pause between starts: the server appears to reject or time
        # out when too many connections are opened at once.
        time.sleep(0.2)
    for t in ths:
        t.join()
    if erro:
        print('下载完毕,下面是下载失败的链接!请自行查看下载')
        for el in erro:
            print(el)
    else:
        print('已经全部下载完毕!')
def main(url):
    """Validate the catalogue URL and start the multi-threaded download.

    Args:
        url: Address entered by the user. Surrounding whitespace is ignored.
            It must be an http(s) URL whose host is ``www.ting56.com``.

    Prints an error message and returns without downloading when the URL is
    empty or does not point at the expected site.
    """
    from urllib.parse import urlparse

    url = url.strip()
    if not url:
        print('输入网址错误!')
        return
    try:
        parsed = urlparse(url)
    except ValueError:
        parsed = None
    # Compare the parsed host rather than looking for the domain among the
    # '/'-separated pieces, which would also accept it as a path segment of
    # some other site or a URL without a scheme.
    if (
        parsed is None
        or parsed.scheme not in ('http', 'https')
        or parsed.hostname != 'www.ting56.com'
    ):
        print('网址错误!')
        return
    lists = get_list(url)  # chapter links and target folder
    threads(lists)  # start the multi-threaded download
if __name__ == '__main__':
    # Prompt for the catalogue page of the book to download
    # (for example http://www.ting56.com/mp3/21163.html) and hand it to main().
    url = input('输入图书所在链接')
    main(url)
版权声明
版权说明:以上内容仅限用于学习和研究目的,不得用于商业或非法用途,否则一切后果自负。我们非常重视版权问题,如有侵权,请发送邮件至(171373236#qq.com)与我们联系处理,敬请谅解!