Efficiently download files asynchronously with requests
I want to download files with Python as fast as possible. Here is my code:
```python
import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
import os
import pathlib
from timeit import default_timer as timer


class AsyncDownloader:
    """Download files asynchronously"""

    __urls = set()
    __dest_path = None
    __user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    __read_timeout = 60
    __connection_timeout = 30
    __download_count = 0  # unlimited
    # http://www.browserscope.org/?category=network
    __worker_count = 17  # No of threads to spawn
    __chunk_size = 1024
    __download_time = -1
    __errors = []

    # TODO Fetch only content of a specific type from a csv
    # TODO Improve code structure so that it can be used as a commandline tool

    def set_source_csv(self, source_path, column_name):
        self.source_path = source_path
        self.column_name = column_name

        try:
            my_csv = pd.read_csv(source_path, usecols=[self.column_name], chunksize=10)
        except ValueError:
            print("The column name doesn't exist")
            return
        else:
            # No exception whatsoever
            for chunk in my_csv:
                AsyncDownloader.__urls.update(set(getattr(chunk, self.column_name)))

    def set_destination_path(self, dest_path):
        if dest_path.endswith('/'):
            dest_path = dest_path[:-1]
        self.dest_path = dest_path
        # TODO Add exception in case we can't create the directory
        pathlib.Path(self.dest_path).mkdir(parents=True, exist_ok=True)
        if os.access(self.dest_path, os.W_OK):
            AsyncDownloader.__dest_path = pathlib.Path(self.dest_path).resolve()

    def set_user_agent(self, useragent):
        self.useragent = useragent
        AsyncDownloader.__user_agent = self.useragent

    def set_connection_timeout(self, ctimeout_secs):
        self.timeout_secs = ctimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__connection_timeout = self.timeout_secs

    def set_read_timeout(self, rtimeout_secs):
        self.timeout_secs = rtimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__read_timeout = self.timeout_secs

    def set_download_count(self, file_count):
        self.file_count = file_count
        if self.file_count > 0:
            AsyncDownloader.__download_count = self.file_count

    def set_worker_count(self, worker_count):
        self.worker_count = worker_count
        if self.worker_count > 0:
            AsyncDownloader.__worker_count = self.worker_count

    def set_chunk_size(self, chunk_size):
        self.chunk_size = chunk_size
        if self.chunk_size > 0:
            AsyncDownloader.__chunk_size = self.chunk_size

    def print_urls(self):
        print(AsyncDownloader.__urls)

    def get_download_time(self):
        return AsyncDownloader.__download_time

    def get_errors(self):
        return AsyncDownloader.__errors

    def download(self):
        start = timer()
        try:
            session = FuturesSession(max_workers=AsyncDownloader.__worker_count)
            session.headers.update({'user-agent': AsyncDownloader.__user_agent})
            session.request(AsyncDownloader.__connection_timeout,
                            AsyncDownloader.__connection_timeout, stream=True)

            results = []
            # Give an accurate file count even if we don't have to download it as it already exists
            file_count = 0

            for url in AsyncDownloader.__urls:
                filename = os.path.basename(url)
                # check if we need only a limited number of files
                if AsyncDownloader.__download_count != 0:
                    # No need to download file if it already exists
                    if pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        file_count += 1
                        continue
                    else:
                        if file_count < AsyncDownloader.__download_count:
                            file_count += 1
                            results.append(session.get(url))
                else:
                    if not pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        results.append(session.get(url))

            for result in results:
                # wait for the response to complete, if it hasn't already
                response = result.result()
                filename = os.path.basename(response.url)
                if response.status_code == 200:
                    with open(pathlib.Path(AsyncDownloader.__dest_path / filename).resolve(), 'wb') as fd:
                        for chunk in response.iter_content(chunk_size=AsyncDownloader.__chunk_size):
                            if chunk:  # filter out keep-alive new chunks
                                fd.write(chunk)

            end = timer()
            AsyncDownloader.__download_time = end - start

        except requests.exceptions.HTTPError as errh:
            AsyncDownloader.__errors.append("Http Error:" + errh)
            # print("Http Error:", errh)
        except requests.exceptions.ConnectionError as errc:
            AsyncDownloader.__errors.append("Error Connecting:" + errc)
            # print("Error Connecting:", errc)
        except requests.exceptions.Timeout as errt:
            AsyncDownloader.__errors.append("Timeout Error:" + errt)
            # print("Timeout Error:", errt)
        except requests.exceptions.RequestException as err:
            AsyncDownloader.__errors.append("OOps: Something Else" + err)
        else:
            return
```
The code below makes a very bad assumption. In effect, I assume that the first URL will finish first, which of course is not correct.
```python
# wait for the response to complete, if it hasn't already
response = result.result()
```
How can I make sure that only requests that have already completed are processed, in an efficient way, instead of relying on the assumption above?
I would appreciate any other suggestions on how to improve performance.
Kind regards
Even if the connections completed in order, you would still be processing the files sequentially: the second file has to wait for the first one to be written, and so on. So the best thing you can do is process everything in parallel (this can be done despite the GIL, because I/O-bound operations such as writing to disk and reading from the network release it). Basically, use the regular `requests` library and hand each complete download, request plus write, to its own worker thread.
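A minimal sketch of that idea, using `requests` together with the standard library's `concurrent.futures.ThreadPoolExecutor`; the `urls`, `dest_dir`, timeouts and worker count here are placeholders, not values taken from the original class:

```python
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def fetch_and_save(url, dest_dir, chunk_size=1024):
    """Download one URL and stream it to disk; runs entirely inside a worker thread."""
    filename = os.path.join(dest_dir, os.path.basename(url))
    response = requests.get(url, stream=True, timeout=(30, 60))
    response.raise_for_status()
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:  # filter out keep-alive chunks
                fd.write(chunk)
    return filename


def download_all(urls, dest_dir, workers=17):
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(fetch_and_save, url, dest_dir): url for url in urls}
        for future in as_completed(futures):  # yields each future as soon as it finishes
            print('finished', futures[future], '->', future.result())
```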
There are more ways to make it even faster, such as downloading the next chunks while the previous ones are being written (i.e. two threads per file, one for the request and one for file handling), or issuing ranged requests so that several parts of the same file can be fetched at once.
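A rough sketch of the two-thread idea under the same assumptions, using a `queue.Queue` to hand chunks from a downloader to a separate writer thread (the URL and chunk size are placeholders):

```python
import os
import queue
import threading

import requests


def download_with_separate_writer(url, chunk_size=1024):
    """Keep pulling chunks from the network while another thread writes them to disk."""
    chunks = queue.Queue(maxsize=64)  # bounded so the downloader can't race too far ahead
    filename = os.path.basename(url)

    def writer():
        with open(filename, 'wb') as fd:
            while True:
                chunk = chunks.get()
                if chunk is None:  # sentinel: download finished
                    break
                fd.write(chunk)

    writer_thread = threading.Thread(target=writer)
    writer_thread.start()

    response = requests.get(url, stream=True, timeout=(30, 60))
    for chunk in response.iter_content(chunk_size=chunk_size):
        if chunk:
            chunks.put(chunk)
    chunks.put(None)  # tell the writer we are done
    writer_thread.join()
```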
Edit: also note that chunked downloading is lazy. That means you are only issuing the initial requests in parallel, but the actual chunked file downloads happen sequentially, because they are done in the main thread. So your current approach is not much better than a fully synchronous one. The advice above still stands.
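As a side note on the literal question: the objects returned by `FuturesSession.get()` are ordinary `concurrent.futures` futures, so one possible minimal change is to consume them with `as_completed` instead of in list order. A rough sketch with placeholder `urls` and `dest_path` (the chunk iteration still happens in the consuming thread, so the advice above still applies):

```python
import os
from concurrent.futures import as_completed

from requests_futures.sessions import FuturesSession


def download_in_completion_order(urls, dest_path, workers=17, chunk_size=1024):
    session = FuturesSession(max_workers=workers)
    futures = [session.get(url, stream=True) for url in urls]
    for future in as_completed(futures):  # handle whichever response arrives first
        response = future.result()
        filename = os.path.join(dest_path, os.path.basename(response.url))
        if response.status_code == 200:
            with open(filename, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        fd.write(chunk)
```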
The simplest approach doesn't require any threading or special async code: just use the regular `requests` library with `stream=True` and pull chunks from each response in turn:
```python
import requests
import os


def stream_multiple(urls):
    responses = {url: requests.get(url, stream=True) for url in urls}
    streams = {url: responses[url].iter_content(chunk_size=1024) for url in urls}
    handles = {url: open(os.path.basename(url), 'wb') for url in urls}
    while streams:
        for url in list(streams.keys()):
            try:
                chunk = next(streams[url])
                print("Received {} bytes for {}".format(len(chunk), url))
                handles[url].write(chunk)
            except StopIteration:  # no more content
                handles[url].close()
                streams.pop(url)
```
Sample output:
```
rat@pandion:~/tmp$ python smu.py
Received 1296 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 1882 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1524 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1508 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 1826 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 2349 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1834 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1838 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 2009 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
...
```
Using threads or multiprocessing might give you slightly better performance, but I doubt it would be substantially better. In virtually all cases, writing your data to disk is going to be far faster than receiving it from the network.
To exercise your code, I created a CSV file containing links to a few different sites (GitHub, UDemy and YouTube).
After debugging, the first results of

```python
response = result.result()
```

were (in this order): UDemy, YouTube, GitHub.
For the record, each …
This means there was no problem to begin with, even though I added the URLs in a specific order.
> I would appreciate any other suggestions on how to improve performance.
As for performance, you can speed things up by creating threads that write the responses to files, or by using an asynchronous I/O library such as Tinche/aiofiles.
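A minimal sketch of the aiofiles idea, assuming the response bodies have already been fetched (for example with your `FuturesSession`) and only the disk writes are made asynchronous; `downloaded` is a hypothetical list of `(filename, bytes)` pairs:

```python
import asyncio

import aiofiles


async def write_file(filename, data):
    # aiofiles runs the blocking file I/O in a thread pool behind the scenes
    async with aiofiles.open(filename, 'wb') as fd:
        await fd.write(data)


async def write_all(downloaded):
    await asyncio.gather(*(write_file(name, data) for name, data in downloaded))


# asyncio.run(write_all([('page1.html', b'...'), ('page2.html', b'...')]))
```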
If you want to go a step further, you can also try to improve the performance of the program itself by running it on an alternative Python implementation such as PyPy.
If you are not worried about "monkey patching", you can use gevent:
```python
import gevent.monkey
import requests

CONNECTIONS = 10

gevent.monkey.patch_all()  # debug in PyCharm: https://blog.jetbrains.com/pycharm/2012/08/gevent-debug-support/

import gevent.pool


def your_request_without_any_changes(url):
    return requests.get(url)


pool = gevent.pool.Pool(CONNECTIONS)
for response in pool.imap_unordered(your_request_without_any_changes, ['http://www.google.com'] * 100):
    print(response.status_code)
```
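For what it's worth, `imap_unordered` yields responses as soon as each one finishes, so the ordering assumption from the question does not arise with this approach.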