Efficiently download files asynchronously with requests

I want to download files as fast as possible with Python. Here is my code:

import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
import os
import pathlib
from timeit import default_timer as timer


class AsyncDownloader:
   """Download files asynchronously"""

    __urls = set()
    __dest_path = None
    __user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    __read_timeout = 60
    __connection_timeout = 30
    __download_count = 0  # unlimited
    # http://www.browserscope.org/?category=network
    __worker_count = 17  # No of threads to spawn
    __chunk_size = 1024
    __download_time = -1
    __errors = []

    # TODO Fetch only content of a specific type from a csv
    # TODO Improve code structure so that it can be used as a commandline tool

    def set_source_csv(self, source_path, column_name):
        self.source_path = source_path
        self.column_name = column_name

        try:
            my_csv = pd.read_csv(source_path, usecols=[self.column_name], chunksize=10)
        except ValueError:
            print("The column name doesn't exist")
            return
        else:
            # No exception whatsoever
            for chunk in my_csv:
                AsyncDownloader.__urls.update(set(getattr(chunk, self.column_name)))

    def set_destination_path(self, dest_path):
        if dest_path.endswith('/'):
            dest_path = dest_path[:-1]
        self.dest_path = dest_path
        # TODO Add exception in case we can't create the directory
        pathlib.Path(self.dest_path).mkdir(parents=True, exist_ok=True)
        if os.access(self.dest_path, os.W_OK):
            AsyncDownloader.__dest_path = pathlib.Path(self.dest_path).resolve()

    def set_user_agent(self, useragent):
        self.useragent = useragent
        AsyncDownloader.__user_agent = self.useragent

    def set_connection_timeout(self, ctimeout_secs):
        self.timeout_secs = ctimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__connection_timeout = self.timeout_secs

    def set_read_timeout(self, rtimeout_secs):
        self.timeout_secs = rtimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__read_timeout = self.timeout_secs

    def set_download_count(self, file_count):
        self.file_count = file_count
        if self.file_count > 0:
            AsyncDownloader.__download_count = self.file_count

    def set_worker_count(self, worker_count):
        self.worker_count = worker_count
        if self.worker_count > 0:
            AsyncDownloader.__worker_count = self.worker_count

    def set_chunk_size(self, chunk_size):
        self.chunk_size = chunk_size
        if self.chunk_size > 0:
            AsyncDownloader.__chunk_size = self.chunk_size

    def print_urls(self):
        print(AsyncDownloader.__urls)

    def get_download_time(self):
        return AsyncDownloader.__download_time

    def get_errors(self):
        return AsyncDownloader.__errors

    def download(self):
        start = timer()
        try:
            session = FuturesSession(max_workers=AsyncDownloader.__worker_count)
            session.headers.update({'user-agent': AsyncDownloader.__user_agent})
            # requests has no session-wide timeout, so pass one to each get() call below
            timeout = (AsyncDownloader.__connection_timeout, AsyncDownloader.__read_timeout)

            results = []
            # Keep an accurate file count even when we skip a download because the file already exists
            file_count = 0

            for url in AsyncDownloader.__urls:
                filename = os.path.basename(url)
                # check if we need only a limited number of files
                if AsyncDownloader.__download_count != 0:
                    # No need to download file if it already exist
                    if pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        file_count += 1
                        continue
                    else:
                        if file_count < AsyncDownloader.__download_count:
                            file_count += 1
                            results.append(session.get(url, stream=True, timeout=timeout))
                else:
                    if not pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        results.append(session.get(url, stream=True, timeout=timeout))

            for result in results:
                # wait for the response to complete, if it hasn't already
                response = result.result()
                filename = os.path.basename(response.url)
                if response.status_code == 200:
                    with open(pathlib.Path(AsyncDownloader.__dest_path / filename).resolve(), 'wb') as fd:
                        for chunk in response.iter_content(chunk_size=AsyncDownloader.__chunk_size):
                            if chunk:  # filter out keep-alive new chunks
                                fd.write(chunk)

            end = timer()
            AsyncDownloader.__download_time = end - start

        except requests.exceptions.HTTPError as errh:
            AsyncDownloader.__errors.append("Http Error: " + str(errh))
            # print("Http Error:", errh)
        except requests.exceptions.ConnectionError as errc:
            AsyncDownloader.__errors.append("Error Connecting: " + str(errc))
            # print("Error Connecting:", errc)
        except requests.exceptions.Timeout as errt:
            AsyncDownloader.__errors.append("Timeout Error: " + str(errt))
            # print("Timeout Error:", errt)
        except requests.exceptions.RequestException as err:
            AsyncDownloader.__errors.append("Oops: Something else: " + str(err))
        else:
            return

The code below makes a very bad assumption. In effect, I am assuming that the first URL will finish first, which of course is not correct.

# wait for the response to complete, if it hasn't already
response = result.result()

How can I make sure that only completed requests are processed, in an efficient way, instead of relying on the assumption above?

I would appreciate any suggestion on how to improve performance.

Kind regards


Even if the connections complete in order, you are still processing the files sequentially: the second file has to wait for the first one to be written, and so on. So the best thing you can do is process everything in parallel (this can be done despite the GIL, because operations like writing to disk and reading from the network release it). Basically, use the regular requests library (not requests-futures) and create one future / thread per request + file handling.

There are more ways to make it faster still, such as continuing to download chunks while writing (i.e. two threads, one for the request and one for file handling), or reading chunks in parallel by issuing multi-part requests; that is "download accelerator" territory, and you may not want that kind of complexity in your code.

Edit: Also, chunked downloads are lazy, which means you are only making the initial requests in parallel, but the actual chunked file download happens sequentially, since it is done in the main thread. So your current approach is not much better than a fully synchronous one. The advice above still stands.
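
For what it's worth, a minimal sketch of that suggestion might look like the following, assuming a urls iterable and a writable dest_path directory like the ones in your class (the worker count and the timeout values are only placeholders mirroring your defaults). Each worker thread performs the request and writes its own file, so network reads and disk writes overlap instead of being serialized in the main thread:

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def fetch_and_save(url, dest_path, chunk_size=1024):
    # one request and one file write, entirely inside a worker thread
    filename = os.path.join(dest_path, os.path.basename(url))
    with requests.get(url, stream=True, timeout=(30, 60)) as response:
        response.raise_for_status()
        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    fd.write(chunk)
    return filename


def download_all(urls, dest_path, workers=17):
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(fetch_and_save, url, dest_path): url for url in urls}
        # as_completed yields each future as soon as it finishes, so a fast
        # download never waits behind a slower one submitted earlier
        for future in as_completed(futures):
            try:
                print("Saved", future.result())
            except requests.RequestException as exc:
                print("Failed", futures[future], exc)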


The simplest approach doesn't require any threading or special async code at all: just use the regular requests library and its built-in streaming option. You say response = session.get(url, stream=True) and then use response.iter_content(chunk_size=1024) (for example) to access the downloaded information one chunk at a time. Here is a working example:

import requests
import os

def stream_multiple(urls):
    responses = {url: requests.get(url, stream=True) for url in urls}
    streams = {url: responses[url].iter_content(chunk_size=1024)
            for url in urls}
    handles = {url: open(os.path.basename(url), 'wb') for url in urls}
    while streams:
        for url in list(streams.keys()):
            try:
                chunk = next(streams[url])
                print("Received {} bytes for {}".format(len(chunk), url))
                handles[url].write(chunk)
            except StopIteration:  # no more content
                handles[url].close()
                streams.pop(url)
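
For example, assuming the function above, it could be called with the Gutenberg URLs that produced the sample output below:

stream_multiple([
    'http://www.gutenberg.org/files/9490/9490-0.txt',
    'http://www.gutenberg.org/ebooks/21497.txt.utf-8',
    'http://www.gutenberg.org/files/1729/1729-0.txt',
    'http://www.gutenberg.org/ebooks/21790.txt.utf-8',
])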

Sample output:

rat@pandion:~/tmp$ python smu.py
Received 1296 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 1882 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1524 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1508 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 1826 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 2349 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1834 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1838 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 2009 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
...

Using threads or multiprocessing might improve performance slightly, but I doubt it would be much better. In almost all cases, writing your data to disk is far faster than receiving it from the network.


To work with your code, I created a .csv file containing links to the robots.txt files of several websites, in the following order: GitHub, UDemy, YouTube.

After debugging, the first result of

response = result.result()

was (in this order): UDemy, YouTube, GitHub.
For the record, the size of each robots.txt increased in the same order in which I got the results.
This means there was no problem to begin with: even though I set up the .csv file in a specific order, the results came back in the order in which the files finished downloading first.

I would appreciate any other suggestion on how to improve performance.

As for performance, you could speed things up by creating threads to write the responses to file, or by using an async IO library such as Tinche/aiofiles for the file handling.
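
A minimal sketch of the "writer threads" idea, assuming results is the list of futures and dest_path the destination directory from your download() method, and that the requests were made with stream=True so each future resolves as soon as the headers arrive: the body reading and file writing is handed to a small thread pool instead of happening one file at a time in the main thread.

import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def write_to_disk(response, dest_path, chunk_size=1024):
    # with stream=True the body is still on the wire; this reads and
    # writes it entirely inside a writer thread
    filename = os.path.join(dest_path, os.path.basename(response.url))
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                fd.write(chunk)


with ThreadPoolExecutor(max_workers=4) as writers:
    # requests-futures returns plain concurrent.futures.Future objects,
    # so as_completed hands us each response as soon as it is ready
    jobs = [writers.submit(write_to_disk, future.result(), dest_path)
            for future in as_completed(results)]
    for job in jobs:
        job.result()  # propagate any write errors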

If you want to go further, you could also try to improve the performance of the program itself by using an alternative Python implementation such as PyPy.


If you aren't worried about "monkey patching", you can use gevent:

import gevent.monkey
import requests

CONNECTIONS = 10

gevent.monkey.patch_all()  # debug in PyCharm: https://blog.jetbrains.com/pycharm/2012/08/gevent-debug-support/

import gevent.pool


def your_request_without_any_changes(url):
    return requests.get(url)


pool = gevent.pool.Pool(CONNECTIONS)
for response in pool.imap_unordered(your_request_without_any_changes, ['http://www.google.com'] * 100):
    print(response.status_code)

gevent uses an "event loop" and patches the requests library (actually this happens at a lower level) so that it can switch to another task while we are waiting for a response.
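
As a small, hypothetical adaptation of that snippet to your use case, the same pool can download and save each file; the URL below is just a placeholder (one of the Gutenberg links from the earlier answer), and in your case the list would come from the CSV.

import gevent.monkey
gevent.monkey.patch_all()  # patch sockets before the workers run

import os

import gevent.pool
import requests

CONNECTIONS = 10


def download_and_save(url):
    # plain requests call; the monkey patching makes the socket I/O
    # cooperative, so the pool overlaps all the downloads
    response = requests.get(url)
    filename = os.path.basename(url)
    with open(filename, 'wb') as fd:
        fd.write(response.content)
    return url, response.status_code


urls = ['http://www.gutenberg.org/files/9490/9490-0.txt']  # e.g. read from your CSV
pool = gevent.pool.Pool(CONNECTIONS)
for url, status in pool.imap_unordered(download_and_save, urls):
    print(status, url)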