Usage:
Edit the thread count and downloader to suit your needs, then run (a concrete example follows the list below):
python 163pp.py [resolution] URL1 URL2...
Resolution options:
murl  medium
surl  small
lurl  very, very small?
turl  even smaller than small
qurl  square
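For example, to grab one album at medium resolution (this is the sample album URL from the script's docstring; wget is assumed to be installed, since it is the default downloader):

python 163pp.py murl http://pp.163.com/daowuzhe123/pp/13424132.html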
Gist:
https://gist.github.com/cnbeining/17a9a58b4a3d76f72d50
The code is below.
#!/usr/bin/env python
#coding:utf-8
# Author: Beining http://www.cnbeining.com/ cnbeining[at]gmail.com
# Purpose: Batch download pp.163.com
# Created: 03/04/2015
# License: GNU GPL 2.0 https://www.gnu.org/licenses/gpl-2.0.html

import os
import sys
import urllib2
import logging
import re
import subprocess
from multiprocessing.dummy import Pool as ThreadPool

FAKE_HEADER = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.16 Safari/537.36',
    'Cache-Control': 'no-cache',
    'Pragma': 'no-cache'}
LOCATION_DIR = os.getcwd()
DOWNLOAD_SOFTWARE = 'wget'


#----------------------------------------------------------------------
def page_reader(url):
    """str->str
    Fetch a page with a browser-like User-Agent."""
    request = urllib2.Request(url, headers=FAKE_HEADER)
    response = urllib2.urlopen(request)
    data = response.read()
    return data

#----------------------------------------------------------------------
def page_parser(webpage):
    """str->dict
    url: http://pp.163.com/daowuzhe123/pp/13424132.html prpr~"""
    logging.info('Retrieving purl...')
    # Find the album's JS data file, e.g.
    # http://s1.ph.126.net/WwP8GD1A3ocjPfENOdgrdQ==/192414543510075.js
    for i in webpage.split('\n'):
        if 'purl' in i:
            purl = 'http://' + i.strip()[6:-2]
    # The album title is GBK-encoded; use it as the download folder name.
    for i in webpage.split('\n'):
        if 'name:' in i:
            folder_name = i.decode('gbk').strip()[7:-2]
            print(folder_name)
            break
    try:
        os.mkdir(folder_name)
    except Exception:
        pass
    os.chdir(LOCATION_DIR + '/' + folder_name)
    purl_data = page_reader(purl)
    # Split the JS array of photo objects into individual "{...}" strings.
    purl_processed = purl_data.split('[{')[1].split('}]')[0].split('},{')
    purl_processed_list = ['{' + i + '}' for i in purl_processed]
    # Quote the bare JS keys so each object becomes a valid Python dict literal.
    pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
    repl = lambda match: '"{}":'.format(match.group(1))
    dict_big = {}
    for i in purl_processed_list:
        dict_this = eval(re.sub(pattern, repl, i))
        photoId = dict_this['photoId']
        dict_big[photoId] = dict_this
    return dict_big

#----------------------------------------------------------------------
def download_video_link((filename, DOWNLOAD_SOFTWARE, img_url)):
    """Download one image with the chosen external downloader."""
    logging.info('Downloading #{filename}...'.format(filename=filename))
    if DOWNLOAD_SOFTWARE == 'aria2c':
        cmd = 'aria2c -c -k1M --out {filename} "{img_url}"'
    elif DOWNLOAD_SOFTWARE == 'wget':
        cmd = 'wget -c -O {filename} "{img_url}"'
    elif DOWNLOAD_SOFTWARE == 'curl':
        cmd = 'curl -L -C - -o {filename} "{img_url}"'  # -C - auto-resumes
    elif DOWNLOAD_SOFTWARE == 'axel':
        cmd = 'axel -o {filename} "{img_url}"'
    cmd = cmd.format(filename=filename, img_url=img_url)
    logging.debug(cmd)
    execute_cmd(cmd)

#----------------------------------------------------------------------
def execute_cmd(cmd):
    """Run a shell command, logging a warning on a non-zero exit code."""
    return_code = subprocess.call(cmd, shell=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    if return_code != 0:
        logging.warning('ERROR')
    return return_code

#----------------------------------------------------------------------
def parse_list(img_dict, resolution):
    """dict->list
    Build (filename, downloader, url) tuples for the requested resolution."""
    down_list = []
    for i in img_dict:
        filename = str(img_dict[i]['photoId']) + '.' + img_dict[i][resolution].split('.')[-1]
        # The first character of the stored path selects the image server shard.
        img_url = 'http://img' + img_dict[i][resolution][0] + '.ph.126.net' + img_dict[i][resolution][1:]
        down_list.append((filename, DOWNLOAD_SOFTWARE, img_url))
    return down_list

#----------------------------------------------------------------------
def downloader(down_list, workers=5):
    """Download every item in down_list with a pool of worker threads."""
    pool = ThreadPool(int(workers))
    # Each tuple is handed to download_video_link in its own thread.
    results = pool.map(download_video_link, down_list)
    # Close the pool and wait for the work to finish.
    pool.close()
    pool.join()

#----------------------------------------------------------------------
def main(link, resolution):
    """Fetch one album page, parse it, and download every photo."""
    page_data = page_reader(link)
    link_dict = page_parser(page_data)
    down_list = parse_list(link_dict, resolution)
    downloader(down_list, 5)

if __name__ == '__main__':
    resolution = sys.argv[1]
    argv_list = sys.argv[2:]
    for link in argv_list:
        os.chdir(LOCATION_DIR)
        main(link, resolution)
    print('Done!')
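The trickiest part of page_parser is turning the album's JavaScript photo array into Python dicts: a regex quotes each bare object key, after which the string is a valid Python literal. Below is a minimal standalone sketch of that trick with a made-up photo object (the field values are hypothetical); it also rebuilds the image URL the way parse_list does. ast.literal_eval is used here as a safer stand-in for the script's eval():

import ast
import re

# Quote bare JS keys: "photoId:" -> '"photoId":' (same pattern as the script).
pattern = r"([a-zA-Z_][a-zA-Z_0-9]*)\s*\:"
repl = lambda match: '"{}":'.format(match.group(1))

js_object = "{photoId: 12345, murl: '1/abc/def.jpg'}"  # hypothetical values
py_literal = re.sub(pattern, repl, js_object)
photo = ast.literal_eval(py_literal)                   # safer than eval()

# The first character of the stored path selects the img*.ph.126.net shard.
img_url = 'http://img' + photo['murl'][0] + '.ph.126.net' + photo['murl'][1:]
print(img_url)  # http://img1.ph.126.net/abc/def.jpg

Note that the pattern would also rewrite any identifier-plus-colon text inside string values; it works here only because the stored paths contain no colons.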