今天抽空写了一个多线程下载文件的方法,参考了网上的案例并做了一些修改,支持下载完成后根据 HTTP 头的 ETag 信息做 MD5 校验(当然这要视实际情况而定:ETag 的内容不一定是 MD5 值,如果不是,就把那段校验代码去掉)。后续可以加上断点续传的功能。
代码如下,以供学习
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Multithreaded HTTP range download with optional ETag/MD5 verification.

Each worker thread requests one byte range of the target URL via the HTTP
``Range`` header and writes it at the matching offset of a shared local file.
After the transfer, the file's MD5 is compared against the server's ETag —
meaningful only when the server actually computes its ETag as an MD5.
"""

import hashlib
import os
import threading
import time
import urllib.request


class MultithreadDownload(threading.Thread):
    """Download one byte range of ``url`` into an already-open file object.

    Parameters:
        url: request URL.
        start_position: first byte of this thread's segment (inclusive).
        end_position: last byte of this thread's segment (inclusive).
        openfile: shared writable binary file object.
        buf: chunk size for each read, in bytes.
    """

    # Class-wide lock: all threads share one file object, so the
    # seek + write pair must be atomic across threads.
    lock = threading.Lock()

    def __init__(self, url, start_position, end_position, openfile, buf):
        threading.Thread.__init__(self)
        self.url = url
        self.buffer = buf
        self.start_position = start_position
        self.end_position = end_position
        self.fobj = openfile
        self.offset = 0

    def run(self):
        # The Range header makes the server return only this segment's bytes,
        # so reading until EOF is sufficient — no need to track a remaining
        # count or re-save a tail chunk (the original did both, redundantly).
        req = urllib.request.Request(self.url)
        req.add_header('Range', 'bytes=%s-%s' % (self.start_position, self.end_position))
        self.offset = self.start_position
        response = urllib.request.urlopen(req)
        try:
            while True:
                block = response.read(self.buffer)
                if not block:
                    break
                self.save(block)
        finally:
            # Always release the connection, even if a read fails.
            response.close()

    def save(self, block):
        # Seek to this thread's current offset and write under the shared
        # lock; the offset then advances by however much was written.
        with MultithreadDownload.lock:
            self.fobj.seek(self.offset)
            self.fobj.write(block)
            self.offset += len(block)


def download(url, threadnum=5, localfile="/tmp/tmp.txt", buf=4096):
    """Download ``url`` into ``localfile`` using ``threadnum`` range threads.

    Parameters:
        url: URL of the file to fetch; the server must honor Range requests.
        threadnum: number of parallel segment threads.
        localfile: destination path (removed first if it already exists).
        buf: per-read chunk size handed to each thread.

    Prints timing and the MD5-vs-ETag comparison result. The check is only
    meaningful when the server's ETag is an MD5 of the content.
    """
    fileCheck(localfile)
    starttime = time.time()

    # Probe the headers once for total size and the remote checksum.
    head = urllib.request.urlopen(url)
    try:
        file_size = int(head.headers['Content-Length'])
        etag = head.headers['ETag']
        if etag.startswith('W/'):
            # Weak validator prefix (RFC 7232) — drop it before unquoting.
            etag = etag[2:]
        md5_remote = etag.strip('"')
    finally:
        head.close()

    # Each thread handles segment_size bytes; the last thread also absorbs
    # the division remainder.
    segment_size, remain_size = divmod(file_size, threadnum)
    with open(localfile, 'wb') as openfile:
        threads = []
        for i in range(threadnum):
            start_position = i * segment_size
            end_position = start_position + segment_size - 1
            if i == threadnum - 1:
                end_position += remain_size
            threads.append(
                MultithreadDownload(url, start_position, end_position, openfile, buf))
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    print('Download completed!')
    usedtime = time.time() - starttime
    print(usedtime)

    md5_local = md5sum(localfile)
    print(md5_local, md5_remote)
    if md5_local == md5_remote:
        print("md5 check ok")
    else:
        print("download file fail md5 error")


def md5sum(localfile):
    """Return the hex MD5 digest of ``localfile``.

    Hashes in 1 MiB chunks so arbitrarily large downloads do not have to
    fit in memory (the original read the whole file at once).
    """
    m = hashlib.md5()
    with open(localfile, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            m.update(chunk)
    return m.hexdigest()


def fileCheck(localfile):
    """Remove ``localfile`` if it already exists so the download starts clean."""
    if os.path.exists(localfile):
        os.remove(localfile)


if __name__ == '__main__':
    url = 'https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-5.0.1.rpm'
    download(url=url, threadnum=10, localfile='test.rpm', buf=4096)
Cloudhu 个人随笔|built by django|
沪ICP备16019452号-1