用python写个爬虫玩玩
在网上搜到几篇python爬虫的文章,很多都没有根据链接数目来决定优先级,对相对地址的URL也是硬编码的多,因此就自己写个爬虫来玩玩。
这个python爬虫代码的特点有:
- 按优先级从高到低的顺序抓取网页。
- 默认使用类似PageRank的优先级:链接到某个页面的网页越多,该页面的优先级越高;同时允许自定义优先级,如优先抓取以.html结尾的URL,或含有指定网址的URL。
- 自动识别相对地址的URL,避免硬编码。
本爬虫包含两个文件:
- Myspider.py:实现爬虫主要逻辑的代码
- spdUtility.py:两个Utility辅助类,即优先级队列类和HTML解析类
# -*- coding: utf-8 -*-
# author: Keengle (http://www.kgblog.net)
import os
import sys
import urllib2
import urlparse

from spdUtility import PriorityQueue, Parser
def updatePriQueue(priQueue, url):
    """Raise the priority of *url* in *priQueue*, inserting it if absent.

    Every call adds 1 to the url's priority, plus a bonus of 2 when the url
    ends with '.html' and a bonus of 5 when it contains 'www.kgblog.net'.
    A url that is not queued yet starts at ``1 + bonus``.

    priQueue must provide ``getitem(url)``, ``remove(item)`` and
    ``push(item)`` where items are ``(priority, url)`` tuples.
    """
    bonus = 0
    if url.endswith('.html'):
        # Prefer pages whose url ends with .html.
        bonus += 2
    if 'www.kgblog.net' in url:
        # Prefer pages that mention the designated site.
        bonus += 5

    existing = priQueue.getitem(url)
    if not existing:
        priQueue.push((1 + bonus, url))
        return

    # Items are immutable tuples, so re-insert with the bumped priority.
    bumped = (existing[0] + 1 + bonus, existing[1])
    priQueue.remove(existing)
    priQueue.push(bumped)
def getmainurl(url):
    """Return the site root of *url* (scheme plus host, no trailing path).

    Used as the prefix for root-relative links. The scheme separator is
    located dynamically instead of assuming a fixed 'http://' prefix, so
    'https://host/path' yields 'https://host' rather than 'https:/'.
    A url without any path component is returned unchanged.
    """
    separator = '://'
    scheme_end = url.find(separator)
    # Only treat '://' as a scheme separator when it contains the very first
    # slash of the url; otherwise it belongs to the path or query string.
    if scheme_end > 0 and url.find('/') == scheme_end + 1:
        host_start = scheme_end + len(separator)
    else:
        host_start = 0
    ix = url.find('/', host_start)
    if ix > 0:
        return url[:ix]
    return url
def analyseHtml(url, html, priQueue, downlist):
    """Extract the hyperlinks of *html* and update the priority queue.

    url      -- address the page was downloaded from; relative links are
                resolved against it.
    html     -- raw page content fed to the HTML parser.
    priQueue -- queue of pending urls, updated through updatePriQueue().
    downlist -- collection of already downloaded urls; urls for which
                ``downlist.count(u)`` is non-zero are skipped.

    If the parser raises on the page, the page contributes no links.
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        # Best effort: a page the parser cannot digest is simply ignored.
        return

    for links in p.anchors.values():
        for href in links:
            # Resolve against the page url so that 'page.html', '../x' and
            # '/x' all become correct absolute urls, then drop '#fragment'
            # so the same document is not queued under several names.
            target = urlparse.urldefrag(urlparse.urljoin(url, href))[0]
            # mailto:, javascript:, ftp: ... cannot be crawled.
            if urlparse.urlsplit(target)[0] not in ('http', 'https'):
                continue
            if not downlist.count(target):
                # Not downloaded yet: add it or bump its priority.
                updatePriQueue(priQueue, target)
def downloadUrl(id, url, priQueue, downlist, downFolder):
    """Download *url*, save it, and analyse its hyperlinks.

    id         -- sequence number used for the saved file name.
    url        -- address to fetch.
    priQueue   -- queue of pending urls, passed on to analyseHtml().
    downlist   -- collection of downloaded urls; *url* is pushed onto it
                  once the connection has been opened.
    downFolder -- folder in which '<id>.html' is written.

    Returns False when the url cannot be opened, True otherwise. Errors
    raised while reading the response or writing the file propagate to the
    caller, but the response and the file are always closed.
    """
    downFileName = downFolder + '/%d.html' % (id,)
    # Progress text is written before the fetch and left without a newline
    # so the outcome marker lands on the same line.
    sys.stdout.write('downloading %s as %s ' % (url, downFileName))
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        # Not a bare except: Ctrl-C (KeyboardInterrupt) must still stop the
        # crawler instead of being reported as a failed download.
        print('[ failed ]')
        return False

    print('[ success ]')
    downlist.push(url)  # remember the url so it is not fetched again

    try:
        html = fp.read()
    finally:
        fp.close()

    # The file is opened only after the body was read, so a failed read
    # does not leave an empty file behind.
    op = open(downFileName, "wb")
    try:
        op.write(html)
    finally:
        op.close()

    analyseHtml(url, html, priQueue, downlist)
    return True
def spider(beginurl, pages, downFolder):
    """Crawl starting at *beginurl* until *pages* pages have been saved.

    The url with the highest priority is taken from the queue on every
    iteration; only successful downloads count towards *pages*. Saved
    pages go to *downFolder*. A summary line is printed at the end.
    """
    pending = PriorityQueue()
    # Urls that were already downloaded, kept to avoid fetching them twice.
    fetched = PriorityQueue()
    pending.push((1, beginurl))

    done = 0
    while done < pages and not pending.empty():
        _, nexturl = pending.pop()
        succeeded = downloadUrl(done + 1, nexturl, pending, fetched, downFolder)
        if succeeded:
            done += 1

    print('\nDownload %d pages, Totally.' % done)
def main(beginurl='http://www.kgblog.net', pages=20,
         downloadFolder='./spiderDown'):
    """Entry point: configure and run the crawler.

    beginurl       -- url the crawl starts from.
    pages          -- number of pages to download.
    downloadFolder -- folder the pages are saved to; created when missing
                      (including intermediate folders).

    Calling main() without arguments keeps the original settings.
    """
    if not os.path.isdir(downloadFolder):
        os.makedirs(downloadFolder)
    spider(beginurl, pages, downloadFolder)


if __name__ == '__main__':
    main()
|
# filename: spdUtility.py
# -*- coding: utf-8 -*-
import bisect


class PriorityQueue(list):
    """Priority queue storing urls together with their priority.

    The queue is a list of ``(priority, url)`` tuples kept in ascending
    order, so ``pop()`` returns the item with the highest priority.
    ``self.map`` indexes the stored items by ``item[1]`` (the url) for
    direct lookup through ``getitem()``.
    """

    def __init__(self):
        list.__init__(self)
        self.map = {}

    def push(self, item):
        """Insert *item* at its sorted position unless it is already queued."""
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item

    def pop(self):
        """Remove and return the last (largest) item."""
        r = list.pop(self)
        # The item is already gone from the list; tolerate a missing index
        # entry instead of raising KeyError half-way through.
        self.map.pop(r[1], None)
        return r

    def getitem(self, url):
        """Return the item indexed under *url*, or None when there is none."""
        return self.map.get(url)

    def empty(self):
        """Return True when the queue holds no items."""
        return len(self) == 0

    def remove(self, item):
        """Remove *item* from the queue; ValueError if it is not present."""
        list.remove(self, item)
        self.map.pop(item[1], None)

    def count(self, item):
        """Return 1 if *item* is queued, else 0 (binary search)."""
        # bisect_left replaces the hand-written search, whose
        # ``(left+right)/2`` midpoint becomes a float under true division.
        pos = bisect.bisect_left(self, item)
        if pos < len(self) and self[pos] == item:
            return 1
        return 0
# Modules used by Parser.
import formatter
import htmllib
import string


class Parser(htmllib.HTMLParser):
    """HTML parser that collects the hyperlinks of a page.

    After feeding a document, ``self.anchors`` maps each non-empty link
    text to the list of href values that were seen with that text.
    """

    def __init__(self, verbose=0):
        self.anchors = {}
        # Rendering is not needed, so a do-nothing formatter is supplied.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)

    def anchor_bgn(self, href, name, type):
        """Start of an <a> element: begin capturing its text, keep its href."""
        self.save_bgn()
        self.anchor = href

    def anchor_end(self):
        """End of an <a> element: record the href under the captured text."""
        label = string.strip(self.save_end())
        if not (self.anchor and label):
            # Links without a target or without visible text are ignored.
            return
        seen = self.anchors.get(label, [])
        self.anchors[label] = seen + [self.anchor]
def main():
    """Small self-test exercising PriorityQueue; prints the results."""
    queue = PriorityQueue()
    # Items are pushed out of order on purpose.
    for entry in [(1, 'http://www.baidu.com'),
                  (2, 'http://www.sina.com'),
                  (3, 'http://www.google.com'),
                  (1, 'http://www.163.com')]:
        queue.push(entry)

    found = queue.getitem('http://www.sina.com')
    print(found)
    print(queue.count(found))
    queue.remove(found)
    print(queue.count(found))

    # Drain the queue, printing its contents.
    while not queue.empty():
        print(queue.pop())


if __name__ == '__main__':
    main()
注:转载请注明出处http://www.kgblog.net 。
你可以在:
print '[ success ]'
downlist.push( url ) #把已下载的url添加到列表中
op = open(downFileName,"wb")
html = fp.read()
op.write( html )
op.close()
fp.close()
在 op.write( html ) 之前,需要先把网页内容从 gb2312 转码为 utf-8,即加上:html = html.decode('gb2312').encode('utf-8')
html = html.decode('gb2312').encode('utf-8')
按照我实际运行的网站的情况,应该改为:
html = html.decode('gb18030').encode('utf-8')
邮箱123fylong@163.com.谢谢!