这几天一直在听评书,发现喜马拉雅上的资源很多,不过很可惜都是付费的,所以我充了一个月会员,简单写了个爬虫,爬下来几十部,够我一年听的了。
脚本里可以配置的有两项:一个是同时下载个数,一个是保存路径;除此之外只需要修改最下面整本书的 albumId。另外我还加入了转换 mp3 格式的函数,不过默认注释掉了,如果想使用,需要先把 ffmpeg 配置到环境变量中。
开始分析
打开chrome控制台,点击播放,最先拿到的一个接口就是
https://mpay.ximalaya.com/mobile/track/pay/244130607/?device=pc
当然这个是付费的一部书,所以如果你浏览器不带 会员的cookie是访问不到的,其中的数字 244130607,这个在他们的接口中叫做 trackId, 每个音频文件对应唯一的一个 trackId
也就是对应这个界面的后面的数字,通过这个唯一的trackId可以获取到音频文件,那么看一下这个接口返回的内容
{ "ret": 0, "msg": "0", "trackId": 244130607, "uid": 170217760, "albumId": 30816438, "title": "《三体》第一季 第十集 聚会与大撕裂", "domain": "http://audiopay.cos.xmcdn.com", "totalLength": 12780565, "sampleDuration": 0, "sampleLength": 0, "isAuthorized": true, "apiVersion": "1.0.0", "seed": 9583, "fileId": "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*", "buyKey": "617574686f72697a6564", "duration": 1578, "ep": "20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f", "highestQualityLevel": 1, "downloadQualityLevel": 1, "authorizedType": 1}
这里我充了会员,所以可以直接在浏览器中打开这个 url。其中有用的字段只有几个:seed 和 fileId 两个字段通过 js 加密算法计算出 m4a 的路径,再拼接上主域名(domain);ep 经过另一个加密算法得到 url 的访问参数 buy_key、sign、token、timestamp;最后将它们拼接到一起,才是一个完整的音频 url。
两个js加密算法
经过我调试我分别找到了这两个加密的 js算法
计算 m4a的路径js算法:
/**
 * Seeded character shuffler used to turn a track's `fileId` into the path of
 * its m4a file.
 *
 * Construction runs a linear congruential generator starting from `seed` and
 * uses it to draw every character of a fixed 68-character alphabet exactly
 * once; the resulting permutation is stored in `_cgStr`.
 */
class vt {
  /**
   * @param {number} t - Seed value (the `seed` field of the track response).
   */
  constructor(t) {
    this._randomSeed = t;
    this.cg_hun();
  }

  /** Builds the shuffled alphabet `_cgStr` from the current seed. */
  cg_hun() {
    this._cgStr = "";
    let pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890";
    const rounds = pool.length;
    for (let round = 0; round < rounds; round++) {
      // ran() is in [0, 1), so the product is always a valid index into pool.
      const pick = Math.floor(this.ran() * pool.length);
      this._cgStr += pool.charAt(pick);
      // Every character occurs once, so dropping position `pick` removes it.
      pool = pool.slice(0, pick) + pool.slice(pick + 1);
    }
  }

  /**
   * Maps a `*`-separated list of indexes onto the shuffled alphabet.
   *
   * The segment after the last `*` is ignored, so the list is expected to end
   * with a trailing `*`.
   *
   * @param {string} t - Index list such as "27*31*44*".
   * @returns {string} The decoded characters, concatenated.
   */
  cg_fun(t) {
    return t
      .split("*")
      .slice(0, -1)
      .map((index) => this._cgStr.charAt(Number(index)))
      .join("");
  }

  /**
   * Advances the generator one step.
   *
   * @returns {number} The new state scaled into [0, 1).
   */
  ran() {
    this._randomSeed = (211 * this._randomSeed + 30031) % 65536;
    return this._randomSeed / 65536;
  }
}

/**
 * Decodes `seed` + `fileId` into the m4a path, guaranteeing a leading "/".
 *
 * @param {number} t - The `seed` field of the track response.
 * @param {string} e - The `fileId` field of the track response.
 * @returns {string} Path starting with "/".
 */
const c = (t, e) => {
  const decoded = new vt(t).cg_fun(e);
  return decoded.startsWith("/") ? decoded : `/${decoded}`;
};

console.log(c(9583, "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*"));
用node跑一下可以得到 m4a的路径
输出:
/group3/M04/9E/88/wKgMbF4ejn2TfGPRAMMEFYoRHXs027.m4a
通过ep来计算url参数的js算法:
// The four helpers below have the shape of transpiler-generated array
// destructuring helpers. Nothing in this snippet calls them; they are kept so
// the names stay defined for any code pasted alongside.
const Z = function () {
  throw new TypeError("Invalid attempt to destructure non-iterable instance");
};

const J = function (t, e) {
  const collected = [];
  let finished = true;
  let failed = false;
  let failure;
  let iterator;
  try {
    iterator = t[Symbol.iterator]();
    for (let step; !(finished = (step = iterator.next()).done); finished = true) {
      collected.push(step.value);
      if (e && collected.length === e) break;
    }
  } catch (err) {
    failed = true;
    failure = err;
  } finally {
    try {
      // Close the iterator only when iteration stopped before exhaustion.
      if (!finished && iterator.return != null) iterator.return();
    } finally {
      if (failed) throw failure;
    }
  }
  return collected;
};

const Q = function (t) {
  if (Array.isArray(t)) return t;
};

const tt = function (t, e) {
  return Q(t) || J(t, e) || Z();
};

/**
 * RC4 stream cipher: key-scheduling followed by keystream XOR.
 * Applying it twice with the same key returns the original text.
 *
 * @param {string} t - Key; each UTF-16 code unit is one key element.
 * @param {string} e - Input text; each code unit is XORed with one keystream byte.
 * @returns {string} The transformed text, one code unit per input code unit.
 */
function yt(t, e) {
  const state = Array.from({ length: 256 }, (_, index) => index);
  let j = 0;
  for (let i = 0; i < 256; i++) {
    j = (j + state[i] + t.charCodeAt(i % t.length)) % 256;
    [state[i], state[j]] = [state[j], state[i]];
  }

  let output = "";
  let i = 0;
  j = 0;
  for (let position = 0; position < e.length; position++) {
    i = (i + 1) % 256;
    j = (j + state[i]) % 256;
    [state[i], state[j]] = [state[j], state[i]];
    output += String.fromCharCode(e.charCodeAt(position) ^ state[(state[i] + state[j]) % 256]);
  }
  return output;
}

const BASE64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/**
 * Substitutes every character of `seed` through the permutation `table`.
 *
 * A character is first mapped to a number (a-z -> 0..25, anything else is
 * treated as a digit -> 26 + digit). If that number occurs among the first 36
 * entries of `table`, it is replaced by its position there. The number is then
 * mapped back to a character the same way.
 *
 * @param {string} seed - Text to substitute.
 * @param {number[]} table - Permutation of 0..35.
 * @returns {string} Substituted text of the same length.
 */
function substituteKey(seed, table) {
  const lookup = table.slice(0, 36);
  let result = "";
  for (const symbol of seed.split("")) {
    let value =
      symbol >= "a" && symbol <= "z"
        ? symbol.charCodeAt(0) - 97
        : symbol.charCodeAt(0) - "0".charCodeAt(0) + 26;
    const position = lookup.indexOf(value);
    if (position !== -1) value = position;
    result += value > 25
      ? String.fromCharCode(value - 26 + "0".charCodeAt(0))
      : String.fromCharCode(value + 97);
  }
  return result;
}

/**
 * Lenient base64 decoder producing a binary string (one char per byte).
 *
 * Characters outside the base64 alphabet are skipped. Decoding stops at the
 * first "=" found in the third or fourth position of a quartet, or at the end
 * of the input; missing padding is tolerated.
 *
 * @param {*} input - Base64 text (anything falsy yields "").
 * @returns {string} Decoded bytes as a string of code units 0..255.
 */
function decodeBase64ToBinaryString(input) {
  if (!input) return "";
  const text = input.toString();
  let output = "";
  let previous = 0; // sextet read just before the current one
  let slot = 0; // position of the current sextet inside its quartet (0..3)
  for (let index = 0; index < text.length; index++) {
    const code = 255 & text.charCodeAt(index);
    if (code === 61 && slot >= 2) return output;
    const sextet = BASE64_ALPHABET.indexOf(String.fromCharCode(code));
    if (sextet === -1) continue;
    if (slot === 1) {
      output += String.fromCharCode((previous << 2) | ((48 & sextet) >> 4));
    } else if (slot === 2) {
      output += String.fromCharCode(((15 & previous) << 4) | ((60 & sextet) >> 2));
    } else if (slot === 3) {
      output += String.fromCharCode(((3 & previous) << 6) | sextet);
    }
    previous = sextet;
    slot = (slot + 1) % 4;
  }
  return output;
}

// NOTE(review): this literal contains U+2020 ("†") and may have lost
// non-printable characters when the article was published. substituteKey()
// treats every character that is not a-z as a digit, so `mt` is expected to
// consist of [a-z0-9] only — verify the literal against the site's script.
const mt = yt("xm", "Ä[üJ=†Û3áf÷N");

const gt = [19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35, 34, 16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3, 2, 23, 25, 27, 11, 20, 5, 15, 12, 0, 33, 26];

/**
 * Decodes the `ep` field of the track response.
 *
 * The cipher key is derived from "d" + mt + "9" via `gt`; `t` is
 * base64-decoded, deciphered with that key and split on "-". The parts are
 * logged and returned.
 *
 * @param {string} t - The `ep` value.
 * @returns {string[]} The "-"-separated parts (per the article:
 *   buy_key, sign, token, timestamp).
 */
const bt = function (t) {
  const key = substituteKey(`d${mt}9`, gt);
  const e1 = yt(key, decodeBase64ToBinaryString(t)).split("-");
  console.log(e1);
  return e1;
};

const c = bt("20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f");
这段js比较复杂,调试的时候坑死我了:相关代码不在同一个地方,导致我来回复制,最终才把这个算法整理到一个js文件中。依然用 node 跑一下,输出:
[ '617574686f72697a6564', 'ef9a0678d77870843ef203d6333ce021', '5790', '1598533668' ]
这几个参数分别对应的是:buy_key sign token timestamp
有了这两个js算法,就可以完全解析这个接口返回的参数了
python 代码仿写加密算法
1.计算 m4a路径加密算法
class vt():
    """Seeded character shuffler used to decode a track's ``fileId``.

    Construction runs a linear congruential generator starting from the seed
    and uses it to draw every character of a fixed 68-character alphabet
    exactly once; the resulting permutation is stored in ``_cgStr``.
    """

    def __init__(self, t):
        # t: seed value (the ``seed`` field of the track response).
        self._randomSeed = t
        self.cg_hun()

    def ran(self):
        """Advance the generator one step and return the state scaled into [0, 1)."""
        self._randomSeed = (211 * self._randomSeed + 30031) % 65536
        return self._randomSeed / 65536

    def cg_hun(self):
        """Build the shuffled alphabet ``_cgStr`` from the current seed."""
        self._cgStr = ""
        pool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890"
        for _ in range(len(pool)):
            # ran() is in [0, 1), so the product is always a valid index.
            pick = int(self.ran() * len(pool))
            self._cgStr += pool[pick]
            # Every character occurs once, so dropping position `pick` removes it.
            pool = pool[:pick] + pool[pick + 1:]

    def cg_fun(self, t):
        """Map a ``*``-separated index list onto the shuffled alphabet.

        The segment after the last ``*`` is ignored, so the list is expected
        to end with a trailing ``*``. Empty segments count as index 0.
        """
        indexes = [int(part) if part else 0 for part in t.split("*")]
        return "".join(self._cgStr[index] for index in indexes[:-1])


def path_decode(seed, fileId):
    """Decode ``seed`` + ``fileId`` into the m4a path.

    Returns the decoded characters as-is; no leading "/" is added.
    """
    return vt(seed).cg_fun(fileId)


if __name__ == '__main__':
    result = path_decode(9583, "27*31*44*62*1*8*6*48*52*4*6*17*16*6*35*35*6*43*25*27*48*63*58*4*50*47*60*64*15*39*59*49*2*36*48*48*16*58*18*44*2*32*12*7*52*64*51*26*29*4*22*")
    print(result)
2.通过ep来计算url参数的算法:
def yt(t, e):
    """RC4 stream cipher: key-scheduling followed by keystream XOR.

    t is the key and e the input text; both are handled one character at a
    time via ord(). Applying the function twice with the same key returns the
    original text.
    """
    state = list(range(256))
    j = 0
    for i in range(256):
        j = (j + state[i] + ord(t[i % len(t)])) % 256
        state[i], state[j] = state[j], state[i]

    output = []
    i = j = 0
    for symbol in e:
        i = (i + 1) % 256
        j = (j + state[i]) % 256
        state[i], state[j] = state[j], state[i]
        output.append(chr(ord(symbol) ^ state[(state[i] + state[j]) % 256]))
    return "".join(output)


_BASE64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"


def _substitute_key(seed, table):
    """Substitute every character of seed through the permutation table.

    A character is mapped to a number (a-z -> 0..25, anything else is treated
    as a digit -> 26 + digit); if that number occurs among the first 36
    entries of table it is replaced by its position there, then mapped back
    to a character the same way.
    """
    lookup = table[:36]
    mapped = []
    for symbol in seed:
        if "a" <= symbol <= "z":
            value = ord(symbol) - 97
        else:
            value = ord(symbol) - ord("0") + 26
        if value in lookup:
            value = lookup.index(value)
        if value > 25:
            mapped.append(chr(value - 26 + ord("0")))
        else:
            mapped.append(chr(value + 97))
    return "".join(mapped)


def _decode_base64(t):
    """Lenient base64 decoder returning one character per decoded byte.

    Characters outside the base64 alphabet are skipped. Decoding stops at the
    first "=" found in the third or fourth position of a quartet, or at the
    end of the input, so input without padding is accepted instead of
    running past the end of the string.
    """
    if not t:
        return ""
    output = []
    previous = 0  # sextet read just before the current one
    slot = 0      # position of the current sextet inside its quartet (0..3)
    for symbol in t:
        code = 255 & ord(symbol)
        if code == 61 and slot >= 2:
            break
        sextet = _BASE64_ALPHABET.find(chr(code))
        if sextet == -1:
            continue
        if slot == 1:
            output.append(chr(previous << 2 | (48 & sextet) >> 4))
        elif slot == 2:
            output.append(chr((15 & previous) << 4 | (60 & sextet) >> 2))
        elif slot == 3:
            output.append(chr((3 & previous) << 6 | sextet))
        previous = sextet
        slot = (slot + 1) % 4
    return "".join(output)


def bt(t):
    """Decode an ``ep`` value into its four "-"-separated URL parameters.

    The cipher key is derived from "d" + mt + "9" via gt; t is
    base64-decoded and deciphered with that key.

    Raises ValueError when the deciphered text does not split into exactly
    four parts.
    """
    key = _substitute_key("d" + mt + "9", gt)
    buy_key, sign, token, timestamp = yt(key, _decode_base64(t)).split('-')
    data = dict(
        buy_key=buy_key,
        sign=sign,
        token=token,
        timestamp=timestamp,
    )
    return data


# NOTE(review): this literal contains U+2020 ("†") and may have lost
# non-printable characters when the article was published. _substitute_key()
# treats every character that is not a-z as a digit, so mt is expected to
# consist of [a-z0-9] only — verify the literal against the site's script.
mt = yt("xm", "Ä[üJ=†Û3áf÷N")
gt = [19, 1, 4, 7, 30, 14, 28, 8, 24, 17, 6, 35, 34, 16, 9, 10, 13, 22, 32, 29, 31, 21, 18, 3, 2, 23, 25, 27, 11, 20, 5, 15, 12, 0, 33, 26]


def ep_decode(ep):
    """Public entry point: decode ``ep`` into a dict of URL parameters."""
    data = bt(ep)
    return data


if __name__ == '__main__':
    print(ep_decode('20NvOoh6T39X3qwKO4cY5g5bVhg+1nfPHIQafFTmCXihnrqF2PjczO8O0auK1KJhDrJ30XMYfKJo2uz+xgwd3rwRPi5f'))
这个接口到此为止才算是完全可以解析
免费接口分析
如果你没有充会员,免费的音频还是可以听的,我找到一个免费音频的接口
https://www.ximalaya.com/revision/play/v1/audio?id=324681559&ptype=1
返回值:
{ "ret": 200, "data": { "trackId": 324681559, "canPlay": true, "isPaid": false, "hasBuy": true, "src": "https://aod.cos.tx.xmcdn.com/group84/M03/4A/A6/wKg5Hl8s0cTwcp6xABQ0EbeuW5Q193.m4a", "albumIsSample": false, "sampleDuration": 48, "isBaiduMusic": false, "firstPlayStatus": true, "isVipFree": false } }
这个接口还是比较简单的,返回值里面直接包含 m4a音频地址,没有加密措施,另外 url中的数字依然是 trackId,值得一提的是免费音频的trackId不能用在付费接口,我猜测是版本迭代的问题,或者是客户端不同的问题,因为当时我不只是分析网页的接口,还抓包了电脑客户端的接口,具体对应的是网页还是客户端我也忘了
解析整本书的接口
喜马拉雅接口主要关键的有两个参数,一个是前面我说的 trackId 另一个就是albumId,trackId 对应唯一的一个音频,而 albumId 对应的是唯一的一本书。
https://www.ximalaya.com/revision/album/v1/getTracksList?albumId=30816438&pageNum=1&pageSize=1000
返回值中就有每一集的trackId,其实喜马拉雅还有很多其他接口,搜索接口等等,一般的其他的接口需要在请求头中加入xm-sign,我也写了xm-sign的计算方法:
import hashlib
import json
import random
import time

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Certificate verification is disabled on every request below, so silence the
# warning urllib3 would otherwise print for each call.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10


# Compute the xm-sign signature
def get_sign(headers):
    """Add an ``xm-sign`` entry to headers and return the same dict.

    The signature is built from the server time returned by the
    ``/revision/time`` endpoint:
    md5("himalaya-<serverTime>") + "(<rand>)" + serverTime + "(<rand>)" + nowTime
    where each <rand> is a random integer in 0..100 and nowTime is the local
    clock in milliseconds.

    Raises requests.exceptions.RequestException on network failure or timeout.
    """
    serverTimeUrl = "https://www.ximalaya.com/revision/time"
    response = requests.get(serverTimeUrl, headers=headers, verify=False, timeout=REQUEST_TIMEOUT)
    serverTime = response.text
    nowTime = str(round(time.time() * 1000))

    digest = hashlib.md5("himalaya-{}".format(serverTime).encode()).hexdigest()
    first_pad = "({})".format(str(round(random.random() * 100)))
    second_pad = "({})".format(str(round(random.random() * 100)))
    sign = str(digest) + first_pad + serverTime + second_pad + nowTime

    headers["xm-sign"] = sign
    return headers


def get_header():
    """Return request headers containing a User-Agent and a fresh ``xm-sign``."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"
    }
    headers = get_sign(headers)
    return headers


if __name__ == '__main__':
    # This is a search endpoint
    url = "https://www.ximalaya.com/revision/search/main?core=all&spellchecker=true&device=iPhone&kw=%E9%9B%AA%E4%B8%AD%E6%82%8D%E5%88%80%E8%A1%8C&page=1&rows=20&condition=relation&fq=&paidFilter=false"
    s = requests.get(url, headers=get_header(), verify=False, timeout=REQUEST_TIMEOUT)
    print(s.json())
还有很多其他接口,我就懒得说了,因为我不想写了,有了这些就可以满足我下载整本书的需求了
最终整合
我写了 喜马拉雅 扫码登陆的脚本,因为我不能每次都去复制浏览器中的 cookie,这种重复劳动太傻了
import http.cookiejar as cookielib
import os
import re
import sys
import time
from base64 import b64decode
from io import BytesIO
from threading import Thread

import psutil
import requests
from PIL import Image

# Certificate verification is disabled on every request below, so silence the
# warning urllib3 would otherwise print for each call.
requests.packages.urllib3.disable_warnings()

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10


class show_code(Thread):
    """Background thread that opens the QR-code image in the default viewer."""

    def __init__(self, data):
        # data: raw image bytes.
        Thread.__init__(self)
        self.data = data

    def run(self):
        img = Image.open(BytesIO(self.data))  # open the image, returns a PIL image object
        img.show()


def is_login(session):
    """Check whether session is logged in, using cookies loaded from its jar file.

    Returns a ``(session, logged_in)`` tuple. When logged in, the user info
    returned by the server is printed.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
    url = "https://www.ximalaya.com/revision/main/getCurrentUser"
    try:
        session.cookies.load(ignore_discard=True)
    except Exception:
        # Best effort: a missing or unreadable cookie file only means the
        # user has to log in again.
        pass
    response = session.get(url, verify=False, headers=headers, timeout=REQUEST_TIMEOUT)
    payload = response.json()
    if payload['ret'] == 200:
        print(payload)
        return session, True
    return session, False


def login():
    """Return a logged-in requests session, asking for a QR-code scan if needed.

    Cookies are kept in ``.cookie/xmly.txt``. If the stored cookies are no
    longer accepted, a QR code is fetched and displayed, and the check
    endpoint is polled once per second until it reports ``ret == 0``.
    The cookies are then written back to the file.
    """
    os.makedirs('.cookie', exist_ok=True)

    session = requests.session()
    session.cookies = cookielib.LWPCookieJar(filename='.cookie/xmly.txt')
    session, status = is_login(session)
    if not status:
        url = "https://passport.ximalaya.com/web/qrCode/gen?level=L"
        response = session.get(url, verify=False, timeout=REQUEST_TIMEOUT)
        data = response.json()
        # Show the QR code on a separate thread so polling can start right away.
        t = show_code(b64decode(data['img']))
        t.start()
        qrId = data['qrId']
        url = 'https://passport.ximalaya.com/web/qrCode/check/%s/%s' % (qrId, int(time.time() * 1000))
        while True:
            response = session.get(url, verify=False, timeout=REQUEST_TIMEOUT)
            data = response.json()
            if data['ret'] == 0:
                break
            time.sleep(1)
    # is_login() loads with ignore_discard=True; save the same way, otherwise
    # cookies flagged as session-only are dropped and never reach the file.
    session.cookies.save(ignore_discard=True)
    return session


if __name__ == '__main__':
    login()
一个简单的扫码登录脚本,cookie 会自动保存成文件,下次使用的时候直接调用:
session = login()
就能在保持登陆状态下,访问各种接口
版权声明
版权说明: 仅限用于学习和研究目的;不得将上述内容用于商业和非法用途!否则一切后果自负。我们非常重视版权问题,如有侵权请邮件至(171373236#qq.com)与我们联系处理,敬请谅解!