B站的API各种逗比。我很悲伤。
最近弹幕服务器做了变化,但是暂时不想更新。
先修正一下API服务器对于P=1时报错的问题。虽然好像不影响下载。因为我考虑了大量的exceptions。
其实是hotfix,但是我也找不到其他好的解决办法。
老地方,https://gist.github.com/cnbeining/9605757
代码藏起来以免影响加载。
''' Biligrab 0.72 Beining@ACICFG cnbeining[at]gmail.com http://www.cnbeining.com MIT licence ''' import sys import os from StringIO import StringIO import gzip import urllib2 import sys import commands from xml.dom.minidom import parse, parseString import xml.dom.minidom reload(sys) sys.setdefaultencoding('utf-8') global vid global cid global partname global title global videourl global part_now def list_del_repeat(list): """delete repeating items in a list, and keep the order. http://www.cnblogs.com/infim/archive/2011/03/10/1979615.html""" l2 = [] [l2.append(i) for i in list if not i in l2] return(l2) #---------------------------------------------------------------------- def find_cid_api(vid, p): """find cid and print video detail""" global cid global partname global title global videourl cid = 0 title = '' partname = '' if str(p) is '0' or str(p) is '1': biliurl = 'http://api.bilibili.tv/view?type=xml&appkey=876fe0ebd0e67a0f&id=' + str(vid) else: biliurl = 'http://api.bilibili.tv/view?type=xml&appkey=876fe0ebd0e67a0f&id=' + str(vid) + '&page=' + str(p) videourl = 'http://www.bilibili.tv/video/av'+ str(vid)+'/index_'+ str(p)+'.html' print('Fetching webpage...') try: request = urllib2.Request(biliurl, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) response = urllib2.urlopen(request) data = response.read() dom = parseString(data) for node in dom.getElementsByTagName('cid'): if node.parentNode.tagName == "info": cid = node.toxml()[5:-6] print('cid is ' + cid) break for node in dom.getElementsByTagName('partname'): if node.parentNode.tagName == "info": partname = node.toxml()[10:-11].strip() print('partname is ' + partname) break for node in dom.getElementsByTagName('title'): if node.parentNode.tagName == "info": title = node.toxml()[7:-8].strip() print('Title is ' + title) except: #If API failed print('ERROR: Cannot connect to API server!') #---------------------------------------------------------------------- def find_cid_flvcd(videourl): """""" global vid global cid global partname global title print('Fetching webpage via Flvcd...') request = urllib2.Request(videourl, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) request.add_header('Accept-encoding', 'gzip') response = urllib2.urlopen(request) if response.info().get('Content-Encoding') == 'gzip': buf = StringIO( response.read()) f = gzip.GzipFile(fileobj=buf) data = f.read() data_list = data.split('\n') #Todo: read title for lines in data_list: if 'cid=' in lines: cid = lines.split('&') cid = cid[0].split('=') cid = cid[-1] print('cid is ' + str(cid)) break #---------------------------------------------------------------------- def find_link_flvcd(videourl): """""" request = urllib2.Request('http://www.flvcd.com/parse.php?kw='+videourl, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) request.add_header('Accept-encoding', 'gzip') response = urllib2.urlopen(request) data = response.read() data_list = data.split('\n') for items in data_list: if 'name' in items and 'inf' in items and 'input' in items: c = items rawurlflvcd = c[39:-5] rawurlflvcd = rawurlflvcd.split('|') return rawurlflvcd #---------------------------------------------------------------------- def main(vid, p, oversea): global cid global partname global title global videourl global is_first_run biliurl = 'http://api.bilibili.tv/view?type=xml&appkey=876fe0ebd0e67a0f&id=' + str(vid) + '&page=' + str(p) videourl = 'http://www.bilibili.tv/video/av'+ str(vid)+'/index_'+ str(p)+'.html' output = commands.getstatusoutput('ffmpeg --help') if str(output[0]) == '32512': print('FFmpeg does not exist! Trying to get you a binary, need root...') os.system('sudo curl -o /usr/bin/ffmpeg https://raw.githubusercontent.com/superwbd/ABPlayerHTML5-Py--nix/master/ffmpeg') output = commands.getstatusoutput('aria2c --help') if str(output[0]) == '32512': print('aria2c does not exist! Trying to get you a binary, need root... Thanks for @MartianZ \'s work.') os.system('sudo curl -o /usr/bin/aria2c https://raw.githubusercontent.com/MartianZ/fakeThunder/master/fakeThunder/aria2c') find_cid_api(vid, p) global cid if cid is 0: print('Cannot find cid, trying to do it brutely...') find_cid_flvcd(videourl) if cid is 0: is_black3 = str(raw_input('Strange, still cannot find cid... Type y for trying the unpredictable way, or input the cid by yourself, press ENTER to quit.')) if 'y' in str(is_black3): vid = vid - 1 p = 1 find_cid_api(vid-1, p) cid = cid + 1 elif str(is_black3) is '': print('Cannot get cid anyway! Quit.') exit() else: cid = str(is_black3) #start to make folders... if title is not '': folder = title else: folder = cid if len(partname) is not 0: filename = partname elif title is not '': filename = title else: filename = cid # In case make too much folders folder_to_make = os.getcwd() + '/' + folder if is_first_run == 0: if not os.path.exists(folder_to_make): os.makedirs(folder_to_make) is_first_run = 1 os.chdir(folder_to_make) print('Fetching XML...') os.system('curl -o "'+filename+'.xml" --compressed http://comment.bilibili.cn/'+cid+'.xml') os.system('gzip -d '+cid+'.xml.gz') print('The XML file, ' + filename + '.xml should be ready...enjoy!') print('Finding video location...') #try api if oversea == '1': try: request = urllib2.Request('http://interface.bilibili.cn/v_cdn_play?cid='+cid, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) except: print('ERROR: Cannot connect to CDN API server!') elif oversea is '2': #Force get oriurl try: request = urllib2.Request('http://interface.bilibili.com/player?id=cid:'+cid, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) except: print('ERROR: Cannot connect to original source API server!') else: try: request = urllib2.Request('http://interface.bilibili.tv/playurl?cid='+cid, headers={ 'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache' }) except: print('ERROR: Cannot connect to normal API server!') response = urllib2.urlopen(request) data = response.read() #print(data_list) rawurl = [] originalurl = '' if oversea is '2': data = data.split('\n') for l in data: if 'oriurl' in l: originalurl = str(l[8:-9]) print('Original URL is ' + originalurl) break if originalurl is not '': rawurl = find_link_flvcd(originalurl) else: print('Cannot get original URL! Using falloff plan...') pass else: dom = parseString(data) for node in dom.getElementsByTagName('url'): if node.parentNode.tagName == "durl": rawurl.append(node.toxml()[14:-9]) #print(str(node.toxml()[14:-9])) pass if rawurl is []: #hope this never happen rawurl = find_link_flvcd(videourl) #flvcd #print(rawurl) vid_num = len(rawurl) if vid_num is 0: print('Cannot get download URL!') exit() #print(rawurl) print(str(vid_num) + ' videos in part ' + str(part_now) + ' to download, fetch yourself a cup of coffee...') for i in range(vid_num): print('Downloading ' + str(i+1) + ' of ' + str(vid_num) + ' videos in part ' + str(part_now) + '...') #print('aria2c -llog.txt -c -s16 -x16 -k1M --out '+str(i)+'.flv "'+rawurl[i]+'"') os.system('aria2c -c -s16 -x16 -k1M --out '+str(i)+'.flv "'+rawurl[i]+'"') #os.system('aria2c -larialog.txt -c -s16 -x16 -k1M --out '+str(i)+'.flv "'+rawurl[i]+'"') #not debugging, not fun. f = open('ff.txt', 'w') ff = '' os.getcwd() for i in range(vid_num): ff = ff + 'file \'' + str(os.getcwd()) + '/'+ str(i) + '.flv\'\n' ff = ff.encode("utf8") f.write(ff) f.close() print('Concating videos...') os.system('ffmpeg -f concat -i ff.txt -c copy "'+filename+'".mp4') os.system('rm -r ff.txt') for i in range(vid_num): os.system('rm -r '+str(i)+'.flv') print('Done, enjoy yourself!') # vid = str(raw_input('av')) p_raw = str(raw_input('P')) oversea = str(input('Source?')) p_list = [] p_raw = p_raw.split(',') for item in p_raw: if '~' in item: #print(item) lower = 0 higher = 0 item = item.split('~') try: lower = int(item[0]) except: print('Cannot read lower!') try: higher = int(item[1]) except: print('Cannot read higher!') if lower == 0 or higher == 0: if lower == 0 and higher != 0: lower = higher elif lower != 0 and higher == 0: higher = lower else: print('Cannot find any higher or lower, ignoring...') #break mid = 0 if higher < lower: mid = higher higher = lower lower = mid p_list.append(lower) while lower < higher: lower = lower + 1 p_list.append(lower) #break else: try: p_list.append(int(item)) except: print('Cannot read "'+str(item)+'", abondon it.') #break p_list = list_del_repeat(p_list) global is_first_run is_first_run = 0 part_now = '0' print(p_list) for p in p_list: reload(sys) sys.setdefaultencoding('utf-8') part_now = str(p) main(vid, p, oversea) exit() ''' data_list = data.split('\r') for lines in data_list: lines = str(lines) if '<url>' in lines: if 'youku' in lines: url = lines[17:-9] elif 'sina' in lines: url = lines[16:-9] elif 'qq.com' in lines: url = lines[17:-9] elif 'letv.com' in lines: url = lines[17:-9] break elif 'acgvideo' in lines: url = lines[17:-9] is_local = 1 rawurl.append(url) if 'backup_url' in lines and is_local is 1: break'''
我好像看到你多处忘记转义了……
就一个curl那里应该弄一下。但是不碍事。。。其他地方执行命令应该不涉及变动。
当然不是那里。
是109行:
request = urllib2.Request(‘http://www.flvcd.com/parse.php?kw=’+videourl, headers=…)
改成
request = urllib2.Request(‘http://www.flvcd.com/parse.php?+urllib.urlencode([(‘kw’, videourl)]), headers=…)
或者(如果参数个数不太多)
request = urllib2.Request(‘http://www.flvcd.com/parse.php?kw=’+urllib.quote_plus(videourl), headers=…)
想想看,如果有人把你的代码用在生产环境上就是天大的漏洞啊。可以用未转义的 & 逃脱参数检查,或者构造 CRLF 攻击等。
试想自己发布的每一段开源代码都有可能被用在别的项目中。比如我的 bilidan 中已经内嵌 biligrab 引擎了。都要对用户负责不是嘛?
po主距离把程序写得Pythonic这个目标还有点远呐 ╮( ̄_ ̄”)╭
嘛 因为不考虑出本地,而且videourl这个是写死的,就没上心。
如果需要,前面也需要对av,p进行检查,看看能不能直接转成int,不能代表有鬼。
下个版本修下。。。还有很多要学的东西。。。。
参数读取是非常建议用 argparse 模块的。
手工询问参数不容易批处理啊。
如果不知道怎么用,可以看看我修改的 bilidan 最后一小段。
实际上 P1~9 这种批量下载的功能不需要费太多工夫支持。
因为用户可以在bash里这样:
for i in `seq 1 9`
do
./biligrab.py xxxxxx/index_$i.html
done
或者这样,如果是zsh的话,
./biligrab.py xxxxxx/index_[1-9].html
你看我的其他一些脚本就在命令行里面收参数。
这个脚本的要求是,用户的界面不是zsh,也不可能为一个简单操作专门写一个bash:因为我下载单个视频,一个视频的多个P,多个视频的单个P都很多。(bash:我特么要吐。。。
还有个小问题:使用时,我不想每次重新敲’./******.py’ ,而是直接按上箭头用历史。要是在命令行里面收参数,需要把之前的东西先删了,然后敲新的。多麻烦。。。在程序里面取就不会出这个坑问题。
当然当然当然当然,要是拎出来,整个main()函数是可以自己跑的。可以单独拎出这个函数,然后用argv,这个不难,但是我觉得做起来不是极其有意义,而且我把flvcd模块单独抽出来又做了个脚本专门批量下其他网站,就是为了批处理,那时就和你想的一样了。
具体原因。。。这个得看这个脚本的演化历史了。。。。。怎么从一个下弹幕的小东西变成一个啥都有的脚本。。。
明白了。我错认为用户都愿意打三行字来启动 biligrab。毕竟 bash 的 for 语句至少要按三个回车……
另外我想知道 flvcd 有什么黑科技能解析出 biligrab 自己不能解析的结果呢?好神奇。不知道 Beining 大大有没有研究过。
来点轻松的:
「具体原因。。。这个得看这个脚本的演化历史了。。。。。怎么从一个下弹幕的小东西变成一个啥都有的脚本。。。」这句话让我想到了典型的 software bloating,根据 Jamie Zawinski 的理论,几年后 biligrab 应该可以收邮件了,等几年看看是不是真的吧,反正 Firefox、Emacs、Blender、SAO Utils 之类的都印证了。(笑)
其实现在也是3个回车,但是敲的内容少得多啊。。。
为啥有个Flvcd呢。。。因为一开始(你找找0.2,0.3左右的版本),我没有用API。
也就是,完全用flvcd。所以就留下了这个东西。
Flvcd现在的存在有几个用途:1)万一API真抽风了,Flvcd可以挡一下,不至于整个程序立马趴窝 2)我加了个强制解析原始源,这个用了flvcd,例如,强制用sina而不是letv的备份。
我去问,flvcd不告诉我,说这个是技术机密。
我认为,flvcd其实用的就是CDN的API。因为IP问题,我们不能指望结果完全一样。但是和我们设置source=1,用他们的IP效果应该一样。
我一开始是想把AcDown做一个OSX下的版本,因为我实在不想用wine。ACPlay我用两个弹幕播放器解决了,下载部分基本上就是这个+you-get了。
当然接下来扒了所有的黑科技,最后到给AcDown提交bug。。。。。。这真的很神奇啊(笑
说不定下个版本我就加上出现bug给我发邮件提交错误记录的功能呢。。。。。。。。。