Multiple (asynchronous) connections with urllib2 or other http library?

I have code like this:

import urllib2

results = []
for p in range(1, 1000):
    result = False
    while result is False:  # retry the same URL until it succeeds
        ret = urllib2.Request('http://server/?' + str(p))
        try:
            result = process(urllib2.urlopen(ret).read())
        except (urllib2.HTTPError, urllib2.URLError):
            pass
    results.append(result)

I would like to make two or three requests at the same time to speed this up. Can I do that with urllib2, and if so, how? If not, which other library should I use? Thanks.


You can do this with asynchronous I/O.

requests + gevent = grequests

GRequests allows you to use Requests with Gevent to make asynchronous HTTP requests easily.

import grequests

urls = [
    'http://www.heroku.com',
    'http://tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://kennethreitz.com'
]

rs = (grequests.get(u) for u in urls)
responses = grequests.map(rs)  # list of Response objects, in the same order as urls
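
If you only want two or three requests in flight at once, as in the question, grequests.map also takes a size argument that caps the underlying gevent pool, plus an exception_handler callback for requests that fail. A minimal sketch, reusing the rs generator from the example above:

def handler(request, exception):
    # called for each request that raised instead of returning a response
    print 'request failed:', exception

responses = grequests.map(rs, size=3, exception_handler=handler)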


Take a look at gevent, a coroutine-based Python networking library that uses greenlet to provide a high-level synchronous API on top of the libevent event loop.

Example:

#!/usr/bin/python
# Copyright (c) 2009 Denis Bilenko. See LICENSE for details.

"""Spawn multiple workers and wait for them to complete"""

urls = ['http://www.google.com', 'http://www.yandex.ru', 'http://www.python.org']

import gevent
from gevent import monkey

# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

import urllib2


def print_head(url):
    print 'Starting %s' % url
    data = urllib2.urlopen(url).read()
    print '%s: %s bytes: %r' % (url, len(data), data[:50])

jobs = [gevent.spawn(print_head, url) for url in urls]

gevent.joinall(jobs)
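
gevent also ships a pool that caps concurrency, which matches the question's "two or three at a time" more closely than spawning everything at once. A minimal sketch, reusing print_head and urls from the example above:

from gevent.pool import Pool

pool = Pool(3)  # at most 3 greenlets run at the same time
for url in urls:
    pool.spawn(print_head, url)
pool.join()  # wait until all spawned greenlets finish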


Well, it's 2016 and we have Python 3.4+ with the built-in asyncio module for asynchronous I/O. We can use aiohttp as an HTTP client to download multiple URLs in parallel.

import asyncio
from aiohttp import ClientSession

async def fetch(url):
    async with ClientSession() as session:
        async with session.get(url) as response:
            return await response.read()

async def run(loop, r):
    url ="http://localhost:8080/{}"
    tasks = []
    for i in range(r):
        task = asyncio.ensure_future(fetch(url.format(i)))
        tasks.append(task)

    responses = await asyncio.gather(*tasks)
    # you now have all response bodies in this variable
    print(responses)

loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(loop, 4))
loop.run_until_complete(future)

Source: copied and pasted from http://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
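
If you want to cap concurrency at two or three requests, as the question asks, one common pattern is to wrap the fetch in an asyncio.Semaphore. A minimal sketch, assuming the fetch coroutine from the snippet above:

sem = asyncio.Semaphore(3)

async def bounded_fetch(url):
    # at most 3 fetches may hold the semaphore at any one time
    async with sem:
        return await fetch(url)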


I know this question is a little old, but I thought it might be useful to promote another asynchronous solution built on the requests library.

list_of_requests = ['http://moop.com', 'http://doop.com', ...]

from simple_requests import Requests
for response in Requests().swarm(list_of_requests):
    print response.content

The documentation is here: http://pythonhosted.org/simple-requests/


Maybe use multiprocessing and split your work between two or so processes.

Here is an example (it is untested):

import multiprocessing
import Queue
import urllib2


NUM_PROCESS = 2
NUM_URL = 1000


class DownloadProcess(multiprocessing.Process):
    """Download Process"""

    def __init__(self, urls_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.urls = urls_queue
        self.result = result_queue

    def run(self):
        while True:
            try:
                url = self.urls.get_nowait()
            except Queue.Empty:
                break

            try:
                res = urllib2.urlopen(urllib2.Request(url))
                result = res.read()
            except (urllib2.HTTPError, urllib2.URLError):
                continue  # skip failed downloads rather than re-queueing them

            self.result.put(result)

        # Sentinel so the main process knows this worker is done
        self.result.put(None)


def main():

    main_url = 'http://server/?%s'

    urls_queue = multiprocessing.Queue()
    for p in range(1, NUM_URL):
        urls_queue.put(main_url % p)

    result_queue = multiprocessing.Queue()

    for i in range(NUM_PROCESS):
        download = DownloadProcess(urls_queue, result_queue)
        download.start()

    # Drain the queue until every worker has reported its sentinel;
    # this avoids blocking forever on a fixed result count when some
    # downloads fail and are skipped.
    results = []
    finished = 0
    while finished < NUM_PROCESS:
        result = result_queue.get()
        if result is None:
            finished += 1
        else:
            results.append(result)

    return results


if __name__ == "__main__":
    results = main()

    for res in results:
        print(res)
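
If the per-URL work is this uniform, the same idea can be written more compactly with multiprocessing.Pool, which manages the queues and worker processes for you. A minimal sketch under the same assumptions (Python 2, untested; failed downloads become None and are filtered out):

import multiprocessing
import urllib2


def download(url):
    try:
        return urllib2.urlopen(url).read()
    except (urllib2.HTTPError, urllib2.URLError):
        return None  # filtered out by the caller below


if __name__ == '__main__':
    urls = ['http://server/?%s' % p for p in range(1, 1000)]
    pool = multiprocessing.Pool(2)  # two worker processes, as above
    results = [r for r in pool.map(download, urls) if r is not None]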


Either figure out threads, or use Twisted (which is asynchronous).
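
For the threads option, a minimal sketch in the spirit of the question's loop (Python 2; the question's process() call is omitted for brevity, and failures are skipped rather than retried):

import threading
import Queue
import urllib2

NUM_THREADS = 3
tasks = Queue.Queue()
results = []
results_lock = threading.Lock()

for p in range(1, 1000):
    tasks.put('http://server/?' + str(p))

def worker():
    while True:
        try:
            url = tasks.get_nowait()
        except Queue.Empty:
            return  # queue drained, thread exits
        try:
            data = urllib2.urlopen(url).read()
        except (urllib2.HTTPError, urllib2.URLError):
            continue
        with results_lock:  # list.append is thread-safe, but be explicit
            results.append(data)

threads = [threading.Thread(target=worker) for _ in range(NUM_THREADS)]
for t in threads:
    t.start()
for t in threads:
    t.join()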