PycURL

http://wiki.woodpecker.org.cn/moin/zspy

Code: http://zspy.googlecode.com

张沈鹏 zsp007@gmail.com http://zsp.iteye.com/

2008-1-23 16:42

1. PycURL

PycURL: http://pycurl.sourceforge.net/

PycURL is a Python interface to the external libcurl library. Written in C, it is faster than urllib and more powerful, and it supports a maximum redirect depth as protection against redirect (rewrite) loops. It is well suited to writing web crawlers and fetching pages.

Download pycurl-ssl-7.16.4.win32-py2.5.exe from http://pycurl.sourceforge.net/download/ and install it.
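To confirm the installation worked, a quick check in the interpreter is enough; a minimal sketch (the exact version string depends on the build you installed):

import pycurl
# Prints the pycurl/libcurl version string, e.g. "PycURL/7.16.4 libcurl/7.16.4 OpenSSL/0.9.8e zlib/1.2.3"
print pycurl.version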

Test code, adapted from reference 1:


# Treat a string buffer like a file; "from cStringIO import StringIO" would also work and perform a bit better
import StringIO

html = StringIO.StringIO()

import pycurl
c = pycurl.Curl()

c.setopt(pycurl.URL, 'http://www.baidu.com')

# Write callback: the response body goes into the StringIO buffer
c.setopt(pycurl.WRITEFUNCTION, html.write)

c.setopt(pycurl.FOLLOWLOCATION, 1)

# Maximum number of redirects; guards against redirect loops (traps)
c.setopt(pycurl.MAXREDIRS, 5)

# Perform the request; blocks until the transfer finishes
c.perform()

# Prints 200 (the HTTP status code) and http://www.baidu.com (the effective URL)
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)

# Dump the HTML of the Baidu front page
#print html.getvalue()
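The same single-handle pattern also works for saving straight to a file and for catching network errors. Below is a small sketch along those lines; it is not part of the original test code, and the output filename and timeout values are only illustrative:

import pycurl

f = open('baidu.html', 'wb')  # hypothetical output file

c = pycurl.Curl()
c.setopt(pycurl.URL, 'http://www.baidu.com')
# Write the response body directly to the file instead of a StringIO buffer
c.setopt(pycurl.WRITEFUNCTION, f.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
# Give up if connecting takes more than 10 s or the whole transfer more than 60 s
c.setopt(pycurl.CONNECTTIMEOUT, 10)
c.setopt(pycurl.TIMEOUT, 60)

try:
    c.perform()
    print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
except pycurl.error, e:
    # e carries the curl error code and message
    print 'download failed:', e
finally:
    c.close()
    f.close()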

Next, let's look at concurrent downloads. There are many examples under http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/, and the CurlMulti documentation at http://pycurl.sourceforge.net/doc/curlmultiobject.html is also worth reading.
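For orientation, here is a condensed single-threaded version of the perform/select pattern described in the CurlMulti documentation; the two URLs are only placeholders, and the threaded rewrite below is built around the same loop:

import pycurl
from cStringIO import StringIO

urls = ['http://www.baidu.com/', 'http://www.google.com/']

m = pycurl.CurlMulti()
handles = []
for url in urls:
    c = pycurl.Curl()
    c.buf = StringIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, c.buf.write)
    m.add_handle(c)
    handles.append(c)

num_active = len(handles)
while num_active:
    # Wait until at least one transfer has activity, then drive all of them
    m.select(1.0)
    while 1:
        ret, num_active = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

for c in handles:
    print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
    m.remove_handle(c)
    c.close()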

I rewrote one of them myself :)


#!/usr/bin/env python
#coding=utf-8

import threading
import pycurl
from cStringIO import StringIO

class UrlOpen(threading.Thread):
    """Download web pages asynchronously."""

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        self.handle_list = []

    def add(self, url, recall, writer=None):
        """
        Arguments: URL, callback function, object that buffers the downloaded data
        """
        if writer is None:
            # Create a fresh buffer per request so responses do not share one StringIO
            writer = StringIO()

        c = pycurl.Curl()

        # Attach these so the callback can get at them later
        c.url = url
        c.content = writer
        c.recall = recall
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _remove(self, c):
        # Detach from the multi handle before closing the easy handle
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        num_handle = len(self.handle_list)
        while 1:
            ret = self.opener.select(10.0)
            if ret == -1:
                continue
            while 1:
                num_handle_pre = num_handle
                ret, num_handle = self.opener.perform()
                # When the number of active connections changes, collect results
                if num_handle != num_handle_pre:
                    # info_read() returns (num queued msgs, [finished handles], [(handle, errno, errmsg), ...])
                    result = self.opener.info_read()
                    print result
                    for i in result[1]:
                        # Finished successfully
                        i.http_code = i.getinfo(i.HTTP_CODE)
                        self._remove(i)
                        i.recall(i)
                    for i in result[2]:
                        # Failed; this should really be logged
                        self._remove(i[0])

                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

_opener = None
def urlopen(*arg, **key):
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.add(*arg, **key)
        _opener.start()
    else:
        _opener.add(*arg, **key)

def show(x):
    print x.content.getvalue()

if __name__ == "__main__":
    urlopen("http://www.baidu.com/", show)
    _opener.join()

I then wrapped this into another class and helper function for opening URLs asynchronously:


#coding=utf-8
"""
Async url opener
Author: zsp007@gmail.com
2008-1-25 17:14
"""

import threading
from cStringIO import StringIO

import pycurl

class UrlOpen(threading.Thread):
    """Download web pages asynchronously."""

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        self.handle_list = []
        self.waiting = []

    def add(self, url, recall, catch=None, writer=None):
        """
        Arguments: URL, success callback, error callback, object that buffers the downloaded data
        """
        if catch is None:
            def catch(curl, error_no, desp):
                #print "Error:%s - %s" % (error_no, desp)
                pass

        if writer is None:
            # Create a fresh buffer per request so responses do not share one StringIO
            writer = StringIO()

        c = pycurl.Curl()

        # Attach these so the callbacks can get at them later
        c.url = url
        c.content = writer
        c.recall = recall
        c.catch = catch
        c.setopt(c.URL,
                 url.encode('utf-8') if type(url) is unicode else url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.waiting.append(c)

    def _add(self):
        # Move queued requests into the CurlMulti handle
        waiting = self.waiting[:]
        self.waiting = []
        for c in waiting:
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        # Detach from the multi handle before closing the easy handle
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def run(self):
        import time
        num_handle = 0
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        num_handle_pre = num_handle
                        ret, num_handle = self.opener.perform()
                        # When the number of active connections changes, collect results
                        if num_handle != num_handle_pre:
                            result = self.opener.info_read()
                            for i in result[1]:
                                # Finished successfully
                                i.http_code = i.getinfo(i.HTTP_CODE)
                                self._remove(i)
                                i.recall(i)
                            for i in result[2]:
                                # Failed: log it or call the error callback.
                                # i looks like (<pycurl.Curl object at 0x00C04C80>, 6,
                                #               'Could not resolve host: www.msn.com (Domain name not found)')
                                i[0].catch(*i)
                                self._remove(i[0])
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
            else:
                time.sleep(1)
            self._add()

_opener = None
def urlopen(*arg, **key):
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.start()
    _opener.add(*arg, **key)

if __name__ == "__main__":
    def show(x):
        print x.content.getvalue()
        print '--' * 11

    urlopen("http://www.baidu.com/", show)
    urlopen("http://www.google.com/", show)
    urlopen("http://www.sougou.com/", show)
    urlopen("http://www.yodao.com/", show)
    urlopen("http://www.yahoo.com/", show)
    urlopen("http://www.msn.com/", show)
    _opener.join()
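If the failure side matters, the error callback can be passed explicitly. A hypothetical variant of the demo above: the names saved and failed are made up here, the second URL is just an unreachable host to trigger the error path, and failed receives the (handle, error code, error message) values that info_read() reports.

def saved(c):
    # c.url, c.http_code and c.content were attached to the handle in add()
    print 'ok:', c.url, c.http_code, len(c.content.getvalue())

def failed(c, error_no, desp):
    print 'failed:', c.url, error_no, desp

urlopen("http://www.baidu.com/", saved, catch=failed)
urlopen("http://no.such.host.invalid/", saved, catch=failed)
_opener.join()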

1.1. References

1. A brief introduction to PycURL (limodou): http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx

2. Notes on the pycurl module in Python: https://forum.eviloctal.com/read.php?tid=27337