[python] Gather all URLs from a given website
# encoding=utf8
import sys
reload(sys)
sys.setdefaultencoding('utf8')

import time
import urllib2
import urlparse
from bs4 import BeautifulSoup

url = "http://log.anycle.com"
domain = "log.anycle.com"
sites = set()      # every in-domain URL discovered so far
visited = set()    # URLs already crawled

def get_local_pages(url, domain):
    """Fetch one page and return the set of new in-domain links found on it."""
    global sites
    repeat_time = 0
    pages = set()

    # Retry the request up to 5 times before giving up on this page
    while True:
        try:
            time.sleep(1)
            web = urllib2.urlopen(url=url, timeout=3)
            break
        except:
            time.sleep(1)
            repeat_time = repeat_time + 1
            if repeat_time == 5:
                return pages

    soup = BeautifulSoup(web.read(), 'html.parser')
    tags = soup.find_all(name='a')

    for tag in tags:
        try:
            ret = tag['href']
        except KeyError:
            continue    # <a> tag without an href attribute

        o = urlparse.urlparse(ret)

        # Relative link: rebuild an absolute URL from the page we fetched
        if o[0] == "" and o[1] == "":
            url_obj = urlparse.urlparse(web.geturl())
            if ret[0] == '/':
                ret = url_obj[0] + "://" + url_obj[1] + ret
            else:
                ret = url_obj[0] + "://" + url_obj[1] + "/" + ret
            # Collapse doubled slashes in the path, keeping the "http://"
            ret = ret[:8] + ret[8:].replace('//', '/')
            o = urlparse.urlparse(ret)

            # Resolve "../" segments by dropping each ".." and the segment before it
            if '../' in o[2]:
                paths = o[2].split('/')
                for i in range(len(paths)):
                    if paths[i] == '..':
                        paths[i] = ''
                        if paths[i-1]:
                            paths[i-1] = ''
                tmp_path = ''
                for path in paths:
                    if path == '':
                        continue
                    tmp_path = tmp_path + '/' + path
                ret = ret.replace(o[2], tmp_path)

        if 'http' not in o[0]:      # skip mailto:, javascript:, ftp:, ...
            continue
        if domain not in o[1]:      # skip off-site links
            continue

        # Skip links to binary downloads and images
        if ret.endswith(('.pdf', '.exe', '.rar', '.zip', '.png', '.jpg')):
            print("skip : " + ret)
            continue

        newpage = ret
        if newpage not in sites:
            pages.add(newpage)
    return pages

def dfs(pages):
    """Depth-first crawl starting from the given set of pages."""
    if not pages:
        return
    global url
    global domain
    global sites
    global visited
    sites = set.union(sites, pages)
    for page in pages:
        if page not in visited:
            print "Visiting", page
            visited.add(page)
            url = page
            pages = get_local_pages(url, domain)
            dfs(pages)

pages = get_local_pages(url, domain)
dfs(pages)

print("*****************************\n")
print(" dfs over, write to file now\n")
fd = open("./data.txt", "wb+")
for i in sites:
    print(i)
    fd.write(i + "\n")
fd.close()
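The hand-rolled absolute-URL and "../" fix-up above is the most fragile part of the script. The standard library already does this resolution: urlparse.urljoin handles both "/absolute" and "../relative" links in one call. A minimal alternative sketch, where normalize_link is a hypothetical helper name and not part of the original script:

import urlparse

SKIP_EXTS = ('.pdf', '.exe', '.rar', '.zip', '.png', '.jpg')

def normalize_link(page_url, href, domain):
    """Return an absolute in-domain URL for href, or None if it should be skipped."""
    ret = urlparse.urljoin(page_url, href)   # resolves "../" and leading "/" for us
    o = urlparse.urlparse(ret)
    if o.scheme not in ('http', 'https'):    # drop mailto:, javascript:, ...
        return None
    if domain not in o.netloc:               # stay on the target site
        return None
    if ret.endswith(SKIP_EXTS):              # drop binary downloads and images
        return None
    return ret

# Example:
# normalize_link("http://log.anycle.com/a/b.html", "../c.html", "log.anycle.com")
# -> "http://log.anycle.com/c.html"

urljoin also copes with cases the manual version misses, such as "./" segments.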
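One more caveat: dfs recurses once per newly visited page, so a chain of pages longer than Python's default recursion limit (about 1000 frames) will raise RuntimeError. An iterative rewrite with an explicit stack avoids this; a sketch, assuming the get_local_pages defined above:

def crawl(start_url, domain):
    """Iterative depth-first crawl; avoids deep recursion on large sites."""
    stack = [start_url]
    seen = set()
    while stack:
        page = stack.pop()
        if page in seen:
            continue
        seen.add(page)
        print "Visiting", page
        for link in get_local_pages(page, domain):
            if link not in seen:
                stack.append(link)
    return seen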