使用 Python 爬取 56 听书网(www.ting56.com)的有声小说,下面分别给出单线程和多线程两个版本。
单线程
import os
import re
import sys

import requests
from lxml import etree

BASE_URL = 'http://www.ting56.com'
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'}
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the whole script.
REQUEST_TIMEOUT = 30
# Matches the JavaScript assignment that carries the encoded audio link.
DATAS_PATTERN = re.compile(r'var datas=\S*')


def get_url(url):
    """Fetch *url* and return the decoded page source.

    Raises:
        requests.RequestException: on connection problems, timeouts, or any
            HTTP error status (not only 404).  The script entry point turns
            this into the "failed to fetch" message and a non-zero exit.
    """
    res = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return res.text


def _decode_datas(page):
    """Decode the '*'-separated character codes of ``var datas=`` in *page*.

    Raises:
        ValueError: if the assignment is missing or a piece is not a number.
    """
    match = DATAS_PATTERN.search(page)
    if match is None:
        raise ValueError('no "var datas=" assignment found in page')
    # The first and last pieces are skipped, exactly as before; presumably
    # they hold the surrounding JavaScript rather than numbers.
    codes = match[0].split('*')[1:-1]
    return ''.join(chr(int(code)) for code in codes)


def get_con(url):
    """Return the decoded link data of a chapter page, split on '&'.

    The first element of the returned list is used as the audio URL.
    """
    return _decode_datas(get_url(url)).split('&')


def get_list(url):
    """Download every chapter listed on the book index page *url*.

    Files are written to ``D:/<book title>/<chapter number><suffix>``.
    A chapter that cannot be fetched is reported and skipped so the
    remaining chapters are still downloaded.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
        ValueError: if the index page contains no book title.
    """
    tree = etree.HTML(get_url(url))
    name = tree.xpath('//div[@class="tit"]/h1/text()')  # book title
    if not name:
        raise ValueError('book title not found on index page')
    title = name[0].strip()
    hrefs = tree.xpath('//*[@id="vlink_1"]/ul/li/a/@href')  # chapter links
    count = int(tree.xpath('count(//*[@id="vlink_1"]/ul/li)'))  # chapter count

    path = 'D:/' + title + '/'
    os.makedirs(path, exist_ok=True)

    for i, href in enumerate(hrefs, start=1):
        surl = BASE_URL + href
        print(f'正在下载【{title}】的第{i}/{count}章,当前音频所在url:{surl}')
        try:
            # Resolve the chapter page once; the original requested it twice.
            audio_url = get_con(surl)[0]
            audio = requests.get(audio_url, timeout=REQUEST_TIMEOUT)
            # Do not save an HTTP error page as if it were audio.
            audio.raise_for_status()
        except (requests.RequestException, ValueError):
            print(f'第{i}章下载失败,链接:{surl}')
            continue
        # Assumes the audio URL ends with a 4-character extension such as
        # ".mp3" -- TODO confirm for URLs carrying a query string.
        suffix = audio_url[-4:]
        with open(path + str(i) + suffix, 'wb') as f:
            f.write(audio.content)


if __name__ == '__main__':
    # Example input: http://www.ting56.com/mp3/21163.html
    url = input('输入图书所在链接')
    try:
        get_list(url)
    except requests.RequestException:
        print('获取源码失败,退出程序')
        # Exit with a failure status; sys.exit also flushes stdio, which
        # os._exit does not.
        sys.exit(1)
多线程
import os
import re
import sys
import threading
import time

import requests
from lxml import etree

BASE_URL = 'http://www.ting56.com'
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Upper bound on simultaneously running download threads.
MAX_THREADS = 60
# Pause between thread starts; the original author observed forced
# disconnects and timeouts when connections were opened too quickly.
START_DELAY = 0.2
# Matches the JavaScript assignment that carries the encoded audio link.
DATAS_PATTERN = re.compile(r'var datas=\S*')


def get_url(url):
    """Fetch *url* and return the decoded page source.

    Raises:
        requests.RequestException: on connection problems, timeouts, or any
            HTTP error status.  Raising (instead of ``os._exit``) lets a
            worker thread fail on its own without killing the process.
    """
    res = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    return res.text


def _decode_datas(page):
    """Decode the '*'-separated character codes of ``var datas=`` in *page*.

    Raises:
        ValueError: if the assignment is missing or a piece is not a number.
    """
    match = DATAS_PATTERN.search(page)
    if match is None:
        raise ValueError('no "var datas=" assignment found in page')
    # The first and last pieces are skipped, exactly as before; presumably
    # they hold the surrounding JavaScript rather than numbers.
    codes = match[0].split('*')[1:-1]
    return ''.join(chr(int(code)) for code in codes)


def get_con(url):
    """Return the audio URL of the chapter page *url*.

    The decoded link data is split on '&' and its first element returned.
    """
    return _decode_datas(get_url(url)).split('&')[0]


def get_list(url):
    """Read the book index page and prepare the output directory.

    Returns:
        A ``(lists, path)`` tuple: the chapter links found on the page and
        the directory ``D:/<book title>/`` the chapters are saved to.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
        ValueError: if the index page contains no book title.
    """
    xml = etree.HTML(get_url(url))
    name = xml.xpath('//div[@class="tit"]/h1/text()')  # book title
    if not name:
        raise ValueError('book title not found on index page')
    title = name[0].strip()
    lists = xml.xpath('//*[@id="vlink_1"]/ul/li/a/@href')  # chapter links
    count = int(xml.xpath('count(//*[@id="vlink_1"]/ul/li)'))  # chapter count
    path = 'D:/' + title + '/'
    os.makedirs(path, exist_ok=True)
    print(f'开始下载下载【{title}】,本书一共{count}章,请稍等……')
    return lists, path


def download_url(list, path, i, sem):
    """Download chapter number *i* and save it under *path*.

    Args:
        list: chapter link relative to the site root (the parameter name is
            kept for compatibility although it shadows the builtin).
        path: output directory, ending with a path separator.
        i: chapter number, used as the file name.
        sem: semaphore acquired by the caller for this download; it is
            always released here, even when the download fails.

    Raises:
        requests.RequestException, ValueError, OSError: if the chapter
            cannot be fetched, decoded or written.
    """
    try:
        surl = BASE_URL + list
        conurl = get_con(surl)
        # Assumes the audio URL ends with a 4-character extension such as
        # ".mp3" -- TODO confirm for URLs carrying a query string.
        suffix = conurl[-4:]
        res = requests.get(conurl, timeout=REQUEST_TIMEOUT)
        # Do not save an HTTP error page as if it were audio.
        res.raise_for_status()
        print(f'正在下载的第{i} ,当前音频所在url:【{surl}】')
        with open(path + str(i) + suffix, 'wb') as f:
            f.write(res.content)
    finally:
        # Releasing in ``finally`` keeps a failed download from leaking its
        # permit, which would eventually block the scheduling loop.
        sem.release()


def threads(lists, i=0, ths=None):
    """Download all chapters concurrently and report the failures.

    Args:
        lists: the ``(chapter links, output directory)`` pair returned by
            ``get_list``.
        i: number the chapter count starts after (first chapter is i + 1).
        ths: optional list that started threads are appended to; a fresh
            list is created per call when omitted.
    """
    if ths is None:
        ths = []
    path = lists[1]
    erro = []
    erro_lock = threading.Lock()  # guards erro, which workers append to
    sem = threading.Semaphore(MAX_THREADS)

    def worker(href, index):
        # Thread boundary: record any failure instead of letting the thread
        # die with an unreported traceback.
        try:
            download_url(href, path, index, sem)
        except Exception as exc:
            print(f'第{index}章下载失败,链接:{href},原因:{exc!r}')
            with erro_lock:
                erro.append(href)

    for href in lists[0]:
        sem.acquire()  # take a permit; blocks while MAX_THREADS are running
        i += 1
        try:
            th = threading.Thread(target=worker, args=(href, i))
            th.start()
        except RuntimeError:
            # The thread never ran, so nobody else will return its permit.
            sem.release()
            print(f'第{i}章下载失败,链接:{href}')
            # NOTE: despite this message nothing is retried automatically;
            # failed links are only listed once all threads have finished.
            print('等待所有线程结束后重试!')
            with erro_lock:
                erro.append(href)
            continue
        ths.append(th)
        time.sleep(START_DELAY)

    for t in ths:
        t.join()

    if erro:
        print('下载完毕,下面是下载失败的链接!请自行查看下载')
        for el in erro:
            print(el)
    else:
        print('已经全部下载完毕!')


def main(url):
    """Validate *url* and download the book it points to."""
    if not url:
        print('输入网址错误!')
        return
    if 'www.ting56.com' not in url.split('/'):
        print('网址错误!')
        return
    try:
        lists = get_list(url)  # chapter links and output directory
    except requests.RequestException:
        print('获取源码失败,退出程序')
        # Exit with a failure status; sys.exit also flushes stdio, which
        # os._exit does not.
        sys.exit(1)
    threads(lists)  # start the multithreaded download


if __name__ == '__main__':
    # Example input: http://www.ting56.com/mp3/21163.html
    url = input('输入图书所在链接')
    main(url)
版权声明
版权说明: 仅限用于学习和研究目的;不得将上述内容用于商业或非法用途!否则一切后果自负。我们非常重视版权问题,如有侵权请邮件至(171373236#qq.com)与我们联系处理,敬请谅解!