python扫目录程序

来源:岁月联盟 编辑:exp 时间:2012-02-24
by:90sec maxs98 申请论坛邀请码的帖子,现在补上。
 
写了两个版本,先贴单线程的。程序只根据 HTTP 响应头中的状态码判断页面是否存在,所以速度较快。使用时把目标主机作为命令行参数传入;字典文件路径(dict_path)写死在程序里,需要自行修改。(相信你懂的)
 
#!/usr/bin/python
#encoding=utf-8
import sys
import httplib
import re
import time
 
 
def Usage():
    """Print the command-line usage and terminate the script.

    Raises:
        SystemExit: always, via ``sys.exit()``.
    """
    # The script insists on exactly one argument (the host to scan), so the
    # usage text has to name it; otherwise the message is no help at all.
    print('Usage: python scan.py <target-host>')
    sys.exit()
 
# Exactly one command-line argument is required: the host to scan.
if len(sys.argv) != 2:
    Usage()

start = time.time()
target = sys.argv[1]
port = 80
# Wordlist with one URL path per line; the bytes are treated as GBK below.
dict_path = "/media/sf_TDDOWNLOAD/dict.txt"
# Response codes that are reported as "found".
found_statuses = (200, 500, 403, 301)
request_headers = {
    "Host": target,
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Accept": "text/plain",
}

# Iterating the file stops cleanly at EOF, and "with" closes it even when a
# request raises.
with open(dict_path) as f:
    for line in f:
        # Drop the line terminator so it never ends up inside the HTTP
        # request line, then re-encode the path from GBK to UTF-8.
        path = line.rstrip("\r\n").decode("gbk").encode("utf-8")
        if not path:
            # A blank wordlist line carries no path to probe.
            continue
        conn = httplib.HTTPConnection(target, port)
        try:
            conn.request('GET', path, headers=request_headers)
            # Only the status code is inspected; the body is never read.
            ret = conn.getresponse().status
        finally:
            conn.close()
        if ret in found_statuses:
            print('%s%s  found! status: %s' % (target, path, ret))
        else:
            print('%s%s  not found!' % (target, path))
print("done...")
print("Elapsed Time: %s" % (time.time() - start))
 
==========================================
下面是多线程版本,使用了一个队列(Queue)来分发要扫描的路径。注意线程不要开得太多,不然会出现莫名其妙的错误。
#!/usr/bin/env python
import Queue
import threading
import httplib
import time
import re
 
 
# Work queue shared by main(), which fills it with paths, and the ThreadUrl
# workers, which take paths from it.
queue = Queue.Queue()
 
class ThreadUrl(threading.Thread):
    """Worker thread that probes URL paths taken from a queue.

    Every path read from the queue is requested with HTTP GET against
    ``target:port`` and the outcome is printed, classified by the response
    status code.

    Args:
        queue: object providing ``get()`` and ``task_done()``; yields the
            paths to probe.
        target: host name to scan. Defaults to the host the original script
            had hard-coded.
        port: TCP port of the HTTP server.
    """

    # Response codes that are reported as "found".
    FOUND_STATUSES = (200, 500, 403, 301)
    # No explicit Host header: httplib derives it from the connection's
    # host and port, which keeps it correct for non-default ports.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Accept": "text/plain",
    }

    def __init__(self, queue, target="www.xjbaihe.com", port=80):
        threading.Thread.__init__(self)
        self.queue = queue
        self.target = target
        self.port = port

    def run(self):
        """Consume paths from the queue forever, probing each one."""
        while True:
            path = self.queue.get()
            try:
                conn = httplib.HTTPConnection(self.target, self.port)
                try:
                    conn.request('GET', path, headers=self.HEADERS)
                    # Only the status code is inspected; the body is never read.
                    ret = conn.getresponse().status
                finally:
                    conn.close()
                if ret in self.FOUND_STATUSES:
                    print('%s  found! status: %s' % (path, ret))
                else:
                    print('%s  not found!' % path)
            except Exception as exc:
                # Thread top-level boundary: one failed request must not kill
                # the worker, so report the failure and keep consuming.
                print('%s  error: %s' % (path, exc))
            finally:
                # Always signal completion; a missed task_done() would make
                # queue.join() wait forever.
                self.queue.task_done()
 
# Wall-clock timestamp taken at load time; read again after main() returns
# to report the elapsed time.
start = time.time()
def main(thread_count=5, dict_path="/media/sf_TDDOWNLOAD/dictest.txt"):
    """Start the worker threads, queue every wordlist path, wait for completion.

    Args:
        thread_count: number of ThreadUrl workers to start.
        dict_path: wordlist file with one URL path per line; the bytes are
            treated as GBK and re-encoded to UTF-8 before queuing.
    """
    # Spawn a pool of daemon threads and hand each one the shared queue.
    for _ in range(thread_count):
        worker = ThreadUrl(queue)
        worker.setDaemon(True)
        worker.start()

    # Populate the queue with data.
    print("reading dict...")
    # Iterating the file stops cleanly at EOF, and "with" closes it even if
    # decoding a line fails.
    with open(dict_path) as f:
        for line in f:
            # Drop the line terminator so it never ends up inside the HTTP
            # request line.
            path = line.rstrip("\r\n").decode("gbk").encode("utf-8")
            # Blank lines carry no path, so they are not queued.
            if path:
                queue.put(path)
    print("done...")

    # Wait on the queue until everything has been processed.
    queue.join()
if __name__ == '__main__':
    # Run the scan, then report how long the whole run took.
    main()
    elapsed = time.time() - start
    print("Elapsed Time: %s" % elapsed)
 
=================
 

图片内容