Select模式,类似于php multi curl异步并发,连接数不能太多:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import pycurl
import cStringIO
# Maximum number of simultaneous connections.
num_conn = 20
urls = ['https://www.haiyun.me/'] * 10000
# Work queue: a separate copy of the URL list, consumed by the main loop.
queue = list(urls)
num_urls = len(queue)
# Never open more connections than there are URLs to fetch.
num_conn = min(num_conn, num_urls)
# A single pre-formatted string prints identically on Python 2 and 3
# (a parenthesised argument list would print a tuple repr on Python 2).
print('----- Getting %d Max conn %d connections -----' % (num_urls, num_conn))
# Create the multi handle and a fixed pool of reusable easy handles.
m = pycurl.CurlMulti()
m.handles = []
# Options applied identically to every handle in the pool.
common_opts = (
    (pycurl.FOLLOWLOCATION, 1),   # follow HTTP redirects ...
    (pycurl.MAXREDIRS, 5),        # ... but at most five of them
    (pycurl.CONNECTTIMEOUT, 30),  # connect timeout
    (pycurl.TIMEOUT, 300),        # whole-transfer timeout
    (pycurl.NOSIGNAL, 1),         # do not let libcurl use signals
)
for _ in range(num_conn):
    c = pycurl.Curl()
    # Buffer that receives the response body for this handle.
    c.body = cStringIO.StringIO()
    for opt, val in common_opts:
        c.setopt(opt, val)
    m.handles.append(c)
# Every handle starts out idle; the main loop takes handles from this list.
freelist = list(m.handles)
num_processed = 0
# Main loop: run until every URL has either completed or failed.
while num_processed < num_urls:
    # Hand queued URLs to idle handles.
    while queue and freelist:
        url = queue.pop()
        c = freelist.pop()
        # Handles are reused, so each request gets a fresh buffer; writing
        # into the old one would append this response to the previous ones.
        c.body = cStringIO.StringIO()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, c.body.write)
        m.add_handle(c)
        # Remember the URL on the handle so failures can be reported.
        c.url = url
    # Drive the transfers until perform() stops asking to be called again.
    while 1:
        (ret, num_handles) = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break
    # Block for up to one second waiting for socket activity.
    m.select(1.0)
    # Collect finished transfers and return their handles to the free list.
    while 1:
        (num_q, ok_list, err_list) = m.info_read()
        for c in ok_list:
            m.remove_handle(c)
            # c.body.getvalue() holds this request's response body here.
            freelist.append(c)
        for (c, errno, errmsg) in err_list:
            m.remove_handle(c)
            # Pre-formatted so Python 2 does not print a tuple repr.
            print('Failed: %s %s %s' % (c.url, errno, errmsg))
            freelist.append(c)
        num_processed = num_processed + len(ok_list) + len(err_list)
        # num_q is the number of messages still queued; stop when drained.
        if num_q == 0:
            break
# Tear down: close every pooled easy handle, then the multi handle itself.
for handle in m.handles:
    handle.fp = None
    handle.close()
m.close()
epoll 模式:PHP 的 multi curl 不支持此模式。tornado 提供了基于 pycurl multi_socket_action 封装的异步 HTTP client,每个 client 实例维护一个 ioloop:
from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop
# Total number of requests that will be issued.
count = 10000
# Number of responses received so far (updated by handle_request).
done = 0


def handle_request(response):
    """Record one finished request and stop the IOLoop after the last one.

    Called once per fetch with the response object. Increments the
    module-level ``done`` counter; when it reaches ``count`` the IOLoop is
    stopped. If ``response.error`` is truthy it is printed.
    """
    global done
    done += 1
    if done == count:
        # Last outstanding response: end the event loop.
        IOLoop.instance().stop()
    error = response.error
    if error:
        # Single pre-formatted argument: same output on Python 2 and 3.
        print("Error: %s" % (error,))
# The default AsyncHTTPClient implementation is IOLoop-based; select the
# pycurl-backed implementation instead, limited to 20 concurrent clients.
AsyncHTTPClient.configure(
    "tornado.curl_httpclient.CurlAsyncHTTPClient",
    max_clients=20
)
http_client = AsyncHTTPClient()
target = "https://www.haiyun.me/"
# Queue every request up front; handle_request is called for each response.
# NOTE(review): passing a callback to fetch() requires a Tornado version
# that still accepts it -- confirm against the installed release.
for _ in range(count):
    http_client.fetch(target, handle_request)
# Run the event loop until handle_request stops it.
IOLoop.instance().start()
基于 epoll 的 multi curl 在 LAN 环境下效果不如 select:因为所有 socket 都处于活跃状态,所有的 callback 都会被唤醒,导致资源竞争。既然所有 socket 都要处理,直接遍历就是最简单、最有效的方式。
为更好的性能建议libcurl/pycurl开启异步DNS解析。