Efficiently download files asynchronously with requests

I want to download files as fast as possible with Python. Here is my code:

import pandas as pd
import requests
from requests_futures.sessions import FuturesSession
import os
import pathlib
from timeit import default_timer as timer


class AsyncDownloader:
   """Download files asynchronously"""

    __urls = set()
    __dest_path = None
    __user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'
    __read_timeout = 60
    __connection_timeout = 30
    __download_count = 0  # unlimited
    # http://www.browserscope.org/?category=network
    __worker_count = 17  # No of threads to spawn
    __chunk_size = 1024
    __download_time = -1
    __errors = []

    # TODO Fetch only content of a specific type from a csv
    # TODO Improve code structure so that it can be used as a commandline tool

    def set_source_csv(self, source_path, column_name):
        self.source_path = source_path
        self.column_name = column_name

        try:
            my_csv = pd.read_csv(source_path, usecols=[self.column_name], chunksize=10)
        except ValueError:
            print("The column name doesn't exist")
            return
        else:
            # No exception whatsoever
            for chunk in my_csv:
                AsyncDownloader.__urls.update(set(getattr(chunk, self.column_name)))

    def set_destination_path(self, dest_path):
        if dest_path.endswith('/'):
            dest_path = dest_path[:-1]
        self.dest_path = dest_path
        # TODO Add exception in case we can't create the directory
        pathlib.Path(self.dest_path).mkdir(parents=True, exist_ok=True)
        if os.access(self.dest_path, os.W_OK):
            AsyncDownloader.__dest_path = pathlib.Path(self.dest_path).resolve()

    def set_user_agent(self, useragent):
        self.useragent = useragent
        AsyncDownloader.__user_agent = self.useragent

    def set_connection_timeout(self, ctimeout_secs):
        self.timeout_secs = ctimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__connection_timeout = self.timeout_secs

    def set_read_timeout(self, rtimeout_secs):
        self.timeout_secs = rtimeout_secs
        if self.timeout_secs >= 0:
            AsyncDownloader.__read_timeout = self.timeout_secs

    def set_download_count(self, file_count):
        self.file_count = file_count
        if self.file_count > 0:
            AsyncDownloader.__download_count = self.file_count

    def set_worker_count(self, worker_count):
        self.worker_count = worker_count
        if self.worker_count > 0:
            AsyncDownloader.__worker_count = self.worker_count

    def set_chunk_size(self, chunk_size):
        self.chunk_size = chunk_size
        if self.chunk_size > 0:
            AsyncDownloader.__chunk_size = self.chunk_size

    def print_urls(self):
        print(AsyncDownloader.__urls)

    def get_download_time(self):
        return AsyncDownloader.__download_time

    def get_errors(self):
        return AsyncDownloader.__errors

    def download(self):
        start = timer()
        try:
            session = FuturesSession(max_workers=AsyncDownloader.__worker_count)
            session.headers.update({'user-agent': AsyncDownloader.__user_agent})
            # requests has no session-wide timeout, so pass one to each get() call below
            timeout = (AsyncDownloader.__connection_timeout, AsyncDownloader.__read_timeout)

            results = []
            # Keep an accurate file count even when we skip a download because the file already exists
            file_count = 0

            for url in AsyncDownloader.__urls:
                filename = os.path.basename(url)
                # check if we need only a limited number of files
                if AsyncDownloader.__download_count != 0:
                    # No need to download file if it already exist
                    if pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        file_count += 1
                        continue
                    else:
                        if file_count < AsyncDownloader.__download_count:
                            file_count += 1
                            results.append(session.get(url, stream=True, timeout=timeout))
                else:
                    if not pathlib.Path(AsyncDownloader.__dest_path / filename).is_file():
                        results.append(session.get(url, stream=True, timeout=timeout))

            for result in results:
                # wait for the response to complete, if it hasn't already
                response = result.result()
                filename = os.path.basename(response.url)
                if response.status_code == 200:
                    with open(pathlib.Path(AsyncDownloader.__dest_path / filename).resolve(), 'wb') as fd:
                        for chunk in response.iter_content(chunk_size=AsyncDownloader.__chunk_size):
                            if chunk:  # filter out keep-alive new chunks
                                fd.write(chunk)

            end = timer()
            AsyncDownloader.__download_time = end - start

        except requests.exceptions.HTTPError as errh:
            AsyncDownloader.__errors.append("Http Error: " + str(errh))
            # print("Http Error:", errh)
        except requests.exceptions.ConnectionError as errc:
            AsyncDownloader.__errors.append("Error Connecting: " + str(errc))
            # print("Error Connecting:", errc)
        except requests.exceptions.Timeout as errt:
            AsyncDownloader.__errors.append("Timeout Error: " + str(errt))
            # print("Timeout Error:", errt)
        except requests.exceptions.RequestException as err:
            AsyncDownloader.__errors.append("Oops: Something else: " + str(err))
        else:
            return

The code below makes a very bad assumption. In effect, I am assuming that the first URL will finish first, which of course is not correct.

# wait for the response to complete, if it hasn't already
response = result.result()

How can I make sure that only completed requests are processed, in an efficient way, instead of relying on the assumption above?

I would appreciate any suggestion on how to improve performance.

Kind regards


Even if the connections complete in order, you are still processing the files sequentially: the second file has to wait for the first one to be written, and so on. So the best thing you can do is process everything in parallel (this can be done despite the GIL, because operations like writing to disk and reading from the network release it). Basically, use the regular requests library (not requests-futures) and create one future / thread per request + file handling.

There are more ways to make it faster still, such as continuing to download chunks while writing (i.e. two threads, one for the request and one for file handling), or reading chunks in parallel by issuing multi-part requests; that is "download accelerator" territory, and you may not want that kind of complexity in your code.

Edit: Also, chunked downloads are lazy, which means you are only making the initial requests in parallel, but the actual chunked file download happens sequentially, since it is done in the main thread. So your current approach is not much better than a fully synchronous one. The advice above still stands.
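
For what it's worth, a minimal sketch of that suggestion might look like the following, assuming a urls iterable and a writable dest_path directory like the ones in your class (the worker count and the timeout values are only placeholders mirroring your defaults). Each worker thread performs the request and writes its own file, so network reads and disk writes overlap instead of being serialized in the main thread:

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def fetch_and_save(url, dest_path, chunk_size=1024):
    # one request and one file write, entirely inside a worker thread
    filename = os.path.join(dest_path, os.path.basename(url))
    with requests.get(url, stream=True, timeout=(30, 60)) as response:
        response.raise_for_status()
        with open(filename, 'wb') as fd:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    fd.write(chunk)
    return filename


def download_all(urls, dest_path, workers=17):
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = {executor.submit(fetch_and_save, url, dest_path): url for url in urls}
        # as_completed yields each future as soon as it finishes, so a fast
        # download never waits behind a slower one submitted earlier
        for future in as_completed(futures):
            try:
                print("Saved", future.result())
            except requests.RequestException as exc:
                print("Failed", futures[future], exc)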


The simplest approach doesn't require any threading or special async code at all: just use the regular requests library and its built-in streaming option. You say response = session.get(url, stream=True) and then use response.iter_content(chunk_size=1024) (for example) to access the downloaded information one chunk at a time. Here is a working example:

import requests
import os

def stream_multiple(urls):
    responses = {url: requests.get(url, stream=True) for url in urls}
    streams = {url: responses[url].iter_content(chunk_size=1024)
            for url in urls}
    handles = {url: open(os.path.basename(url), 'wb') for url in urls}
    while streams:
        for url in list(streams.keys()):
            try:
                chunk = next(streams[url])
                print("Received {} bytes for {}".format(len(chunk), url))
                handles[url].write(chunk)
            except StopIteration:  # no more content
                handles[url].close()
                streams.pop(url)
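
For example, assuming the function above, it could be called with the Gutenberg URLs that produced the sample output below:

stream_multiple([
    'http://www.gutenberg.org/files/9490/9490-0.txt',
    'http://www.gutenberg.org/ebooks/21497.txt.utf-8',
    'http://www.gutenberg.org/files/1729/1729-0.txt',
    'http://www.gutenberg.org/ebooks/21790.txt.utf-8',
])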

Sample output:

rat@pandion:~/tmp$ python smu.py
Received 1296 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 1882 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1524 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1508 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 1826 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
Received 2349 bytes for http://www.gutenberg.org/ebooks/21497.txt.utf-8
Received 1834 bytes for http://www.gutenberg.org/files/1729/1729-0.txt
Received 1838 bytes for http://www.gutenberg.org/ebooks/21790.txt.utf-8
Received 2009 bytes for http://www.gutenberg.org/files/9490/9490-0.txt
...

Using threads or multiprocessing might improve performance slightly, but I doubt it would be much better. In almost all cases, writing your data to disk is far faster than receiving it from the network.


To work with your code, I created a .csv file containing links to the robots.txt files of several websites, in the following order: GitHub, UDemy, YouTube.

After debugging, the first result of

response = result.result()

was (in this order): UDemy, YouTube, GitHub.
For the record, the size of each robots.txt increased in the same order in which I got the results.
This means there was no problem to begin with: even though I set up the .csv file in a specific order, the results came back in the order in which the files finished downloading first.

I would appreciate any other suggestion on how to improve performance.

As for performance, you could speed things up by creating threads to write the responses to file, or by using an async IO library such as Tinche/aiofiles for the file handling.
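
A minimal sketch of the "writer threads" idea, assuming results is the list of futures and dest_path the destination directory from your download() method, and that the requests were made with stream=True so each future resolves as soon as the headers arrive: the body reading and file writing is handed to a small thread pool instead of happening one file at a time in the main thread.

import os
from concurrent.futures import ThreadPoolExecutor, as_completed


def write_to_disk(response, dest_path, chunk_size=1024):
    # with stream=True the body is still on the wire; this reads and
    # writes it entirely inside a writer thread
    filename = os.path.join(dest_path, os.path.basename(response.url))
    with open(filename, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                fd.write(chunk)


with ThreadPoolExecutor(max_workers=4) as writers:
    # requests-futures returns plain concurrent.futures.Future objects,
    # so as_completed hands us each response as soon as it is ready
    jobs = [writers.submit(write_to_disk, future.result(), dest_path)
            for future in as_completed(results)]
    for job in jobs:
        job.result()  # propagate any write errors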

If you want to go further, you could also try to improve the performance of the program itself by using an alternative Python implementation such as PyPy.


If you aren't worried about "monkey patching", you can use gevent:

import gevent.monkey
import requests

CONNECTIONS = 10

gevent.monkey.patch_all()  # debug in PyCharm: https://blog.jetbrains.com/pycharm/2012/08/gevent-debug-support/

import gevent.pool


def your_request_without_any_changes(url):
    return requests.get(url)


pool = gevent.pool.Pool(CONNECTIONS)
for response in pool.imap_unordered(your_request_without_any_changes, ['http://www.google.com'] * 100):
    print(response.status_code)

gevent uses an "event loop" and patches the requests library (actually this happens at a lower level) so that it can switch to another task while we are waiting for a response.
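
As a small, hypothetical adaptation of that snippet to your use case, the same pool can download and save each file; the URL below is just a placeholder (one of the Gutenberg links from the earlier answer), and in your case the list would come from the CSV.

import gevent.monkey
gevent.monkey.patch_all()  # patch sockets before the workers run

import os

import gevent.pool
import requests

CONNECTIONS = 10


def download_and_save(url):
    # plain requests call; the monkey patching makes the socket I/O
    # cooperative, so the pool overlaps all the downloads
    response = requests.get(url)
    filename = os.path.basename(url)
    with open(filename, 'wb') as fd:
        fd.write(response.content)
    return url, response.status_code


urls = ['http://www.gutenberg.org/files/9490/9490-0.txt']  # e.g. read from your CSV
pool = gevent.pool.Pool(CONNECTIONS)
for url, status in pool.imap_unordered(download_and_save, urls):
    print(status, url)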