作者在 2010-04-20 20:04:16 发布以下内容
HTMLParser版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import HTMLParser
class UrlParser(HTMLParser.HTMLParser):
    """Collect the href attribute of every <a> start tag fed to the parser."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Hrefs accumulate here in document order.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # Only anchor tags carry the links we are interested in.
        if tag != 'a':
            return
        self.urls.extend(value for name, value in attrs if name == 'href')

    def geturls(self):
        """Return the list of hrefs seen so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
# -*- coding: UTF-8 -*-
import HTMLParser
class UrlParser(HTMLParser.HTMLParser):
    """HTML parser that records the target URL of each anchor tag."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.urls = []  # collected hrefs, in document order

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return  # ignore everything that is not an anchor
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                self.urls.append(attr_value)

    def geturls(self):
        """Return all hrefs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
pyquery版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyquery import PyQuery as pq
class UrlParser():
    """Collect the href of every <a> element using pyquery."""

    def __init__(self):
        self.urls = []

    def feed(self, data):
        """Parse *data* (an HTML string) and record each anchor's href."""
        d = pq(data)
        # Iterating d('a') yields the underlying lxml elements, so the
        # attribute can be read directly with .get() -- no need to re-wrap
        # each element with pq(e) and re-select 'a' inside a map() callback.
        for anchor in d('a'):
            href = anchor.get('href')
            if href is not None:  # map() also silently dropped None results
                self.urls.append(href)

    def geturls(self):
        """Return all hrefs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
# -*- coding: UTF-8 -*-
from pyquery import PyQuery as pq
class UrlParser():
    """pyquery-based link extractor: gathers the href of every anchor."""

    def __init__(self):
        self.urls = []

    def feed(self, data):
        """Parse the HTML string *data* and append each <a href=...> value."""
        doc = pq(data)
        # doc('a') iterates as plain lxml elements; element.get('href')
        # reads the attribute directly, which is simpler than the
        # pq(e)('a').attr('href') round-trip inside a map() lambda.
        for element in doc('a'):
            href = element.get('href')
            if href is not None:  # matches map()'s dropping of None returns
                self.urls.append(href)

    def geturls(self):
        """Return all hrefs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
正则表达式版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
class UrlParser():
    """Extract anchor hrefs with a regular expression.

    Fixes over the original pattern:
    - requires whitespace after '<a' so '<abbr href=...>' no longer matches;
    - '[^>]*?' keeps the scan inside one tag (the old '.*?' with re.S could
      run past '>' into later text and produce false positives);
    - handles double-quoted, single-quoted and unquoted attribute values;
    - compiled once at class level instead of on every feed() call.
    """

    _HREF_RE = re.compile(
        r'''<a\s[^>]*?href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']+))''',
        re.S | re.I)

    def __init__(self):
        self.urls = []

    def feed(self, data):
        """Scan the HTML string *data* and record every <a href=...> value."""
        # Exactly one of the three groups is non-empty per match.
        for double_q, single_q, bare in self._HREF_RE.findall(data):
            self.urls.append(double_q or single_q or bare)

    def geturls(self):
        """Return all hrefs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
# -*- coding: UTF-8 -*-
import re
class UrlParser():
    """Regex-based link extractor for <a href=...> values.

    The original pattern had several defects: '<a(\\s*)(.*?)' also matched
    tags like '<abbr ...>' (zero whitespace allowed after 'a'), '.*?' with
    re.S could cross a '>' into unrelated text, and single-quoted or
    unquoted href values were not handled. The pattern below fixes these
    and is compiled once instead of on every feed() call.
    """

    _HREF_RE = re.compile(
        r'''<a\s[^>]*?href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>"']+))''',
        re.S | re.I)

    def __init__(self):
        self.urls = []

    def feed(self, data):
        """Scan the HTML string *data* and record every <a href=...> value."""
        # findall returns 3-tuples; only one group is non-empty per match.
        for double_q, single_q, bare in self._HREF_RE.findall(data):
            self.urls.append(double_q or single_q or bare)

    def geturls(self):
        """Return all hrefs collected so far."""
        return self.urls
if __name__ == '__main__':
    # Demo: collect hrefs from a small HTML snippet and print them.
    urls = []
    parser = UrlParser()
    parser.feed('1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333')
    urls += parser.geturls()
    # print() call form works under both Python 2 and Python 3;
    # the bare "print urls" statement is a SyntaxError on Python 3.
    print(urls)
速度比较:正则表达式 > pyquery > HTMLParser
测试的时候遍历大约1000个页面,正则表达式占绝对优势,这3个速度比例大约是 8:2:1
HTMLParser最慢,pyquery速度大约是它的2倍,正则的速度是它的8倍,看来以后如非必要不再考虑HTMLParser了,用起来也不如pyquery方便,正则速度倒是很快,功能也强大,前两者能提取的内容用正则全部都能实现,而正则能实现的功能前两者就不一定能实现了。只是正则的可读性不好。以后遇到数据量大的用正则表达式,数据量不大不考虑时间因素但逻辑复杂的用pyquery,以后维护起来方便